In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore')
In [4]:
df = pd.read_excel('compactiv.xlsx')
In [5]:
df.head()
Out[5]:
| lread | lwrite | scall | sread | swrite | fork | exec | rchar | wchar | pgout | ... | pgscan | atch | pgin | ppgin | pflt | vflt | runqsz | freemem | freeswap | usr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 2147 | 79 | 68 | 0.2 | 0.2 | 40671.0 | 53995.0 | 0.0 | ... | 0.0 | 0.0 | 1.6 | 2.6 | 16.00 | 26.40 | CPU_Bound | 4670 | 1730946 | 95 |
| 1 | 0 | 0 | 170 | 18 | 21 | 0.2 | 0.2 | 448.0 | 8385.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 15.63 | 16.83 | Not_CPU_Bound | 7278 | 1869002 | 97 |
| 2 | 15 | 3 | 2162 | 159 | 119 | 2.0 | 2.4 | NaN | 31950.0 | 0.0 | ... | 0.0 | 1.2 | 6.0 | 9.4 | 150.20 | 220.20 | Not_CPU_Bound | 702 | 1021237 | 87 |
| 3 | 0 | 0 | 160 | 12 | 16 | 0.2 | 0.2 | NaN | 8670.0 | 0.0 | ... | 0.0 | 0.0 | 0.2 | 0.2 | 15.60 | 16.80 | Not_CPU_Bound | 7248 | 1863704 | 98 |
| 4 | 5 | 1 | 330 | 39 | 38 | 0.4 | 0.4 | NaN | 12185.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 1.2 | 37.80 | 47.60 | Not_CPU_Bound | 633 | 1760253 | 90 |
5 rows × 22 columns
In [6]:
df.tail()
Out[6]:
| lread | lwrite | scall | sread | swrite | fork | exec | rchar | wchar | pgout | ... | pgscan | atch | pgin | ppgin | pflt | vflt | runqsz | freemem | freeswap | usr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8187 | 16 | 12 | 3009 | 360 | 244 | 1.6 | 5.81 | 405250.0 | 85282.0 | 8.02 | ... | 55.11 | 0.6 | 35.87 | 47.90 | 139.28 | 270.74 | CPU_Bound | 387 | 986647 | 80 |
| 8188 | 4 | 0 | 1596 | 170 | 146 | 2.4 | 1.80 | 89489.0 | 41764.0 | 3.80 | ... | 0.20 | 0.8 | 3.80 | 4.40 | 122.40 | 212.60 | Not_CPU_Bound | 263 | 1055742 | 90 |
| 8189 | 16 | 5 | 3116 | 289 | 190 | 0.6 | 0.60 | 325948.0 | 52640.0 | 0.40 | ... | 0.00 | 0.4 | 28.40 | 45.20 | 60.20 | 219.80 | Not_CPU_Bound | 400 | 969106 | 87 |
| 8190 | 32 | 45 | 5180 | 254 | 179 | 1.2 | 1.20 | 62571.0 | 29505.0 | 1.40 | ... | 18.04 | 0.4 | 23.05 | 24.25 | 93.19 | 202.81 | CPU_Bound | 141 | 1022458 | 83 |
| 8191 | 2 | 0 | 985 | 55 | 46 | 1.6 | 4.80 | 111111.0 | 22256.0 | 0.00 | ... | 0.00 | 0.2 | 3.40 | 6.20 | 91.80 | 110.00 | CPU_Bound | 659 | 1756514 | 94 |
5 rows × 22 columns
In [7]:
df.shape
Out[7]:
(8192, 22)
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8192 entries, 0 to 8191 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 lread 8192 non-null int64 1 lwrite 8192 non-null int64 2 scall 8192 non-null int64 3 sread 8192 non-null int64 4 swrite 8192 non-null int64 5 fork 8192 non-null float64 6 exec 8192 non-null float64 7 rchar 8088 non-null float64 8 wchar 8177 non-null float64 9 pgout 8192 non-null float64 10 ppgout 8192 non-null float64 11 pgfree 8192 non-null float64 12 pgscan 8192 non-null float64 13 atch 8192 non-null float64 14 pgin 8192 non-null float64 15 ppgin 8192 non-null float64 16 pflt 8192 non-null float64 17 vflt 8192 non-null float64 18 runqsz 8192 non-null object 19 freemem 8192 non-null int64 20 freeswap 8192 non-null int64 21 usr 8192 non-null int64 dtypes: float64(13), int64(8), object(1) memory usage: 1.4+ MB
In [9]:
df.describe().T
Out[9]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| lread | 8192.0 | 1.955969e+01 | 53.353799 | 0.0 | 2.0 | 7.0 | 20.000 | 1845.00 |
| lwrite | 8192.0 | 1.310620e+01 | 29.891726 | 0.0 | 0.0 | 1.0 | 10.000 | 575.00 |
| scall | 8192.0 | 2.306318e+03 | 1633.617322 | 109.0 | 1012.0 | 2051.5 | 3317.250 | 12493.00 |
| sread | 8192.0 | 2.104800e+02 | 198.980146 | 6.0 | 86.0 | 166.0 | 279.000 | 5318.00 |
| swrite | 8192.0 | 1.500582e+02 | 160.478980 | 7.0 | 63.0 | 117.0 | 185.000 | 5456.00 |
| fork | 8192.0 | 1.884554e+00 | 2.479493 | 0.0 | 0.4 | 0.8 | 2.200 | 20.12 |
| exec | 8192.0 | 2.791998e+00 | 5.212456 | 0.0 | 0.2 | 1.2 | 2.800 | 59.56 |
| rchar | 8088.0 | 1.973857e+05 | 239837.493526 | 278.0 | 34091.5 | 125473.5 | 267828.750 | 2526649.00 |
| wchar | 8177.0 | 9.590299e+04 | 140841.707911 | 1498.0 | 22916.0 | 46619.0 | 106101.000 | 1801623.00 |
| pgout | 8192.0 | 2.285317e+00 | 5.307038 | 0.0 | 0.0 | 0.0 | 2.400 | 81.44 |
| ppgout | 8192.0 | 5.977229e+00 | 15.214590 | 0.0 | 0.0 | 0.0 | 4.200 | 184.20 |
| pgfree | 8192.0 | 1.191971e+01 | 32.363520 | 0.0 | 0.0 | 0.0 | 5.000 | 523.00 |
| pgscan | 8192.0 | 2.152685e+01 | 71.141340 | 0.0 | 0.0 | 0.0 | 0.000 | 1237.00 |
| atch | 8192.0 | 1.127505e+00 | 5.708347 | 0.0 | 0.0 | 0.0 | 0.600 | 211.58 |
| pgin | 8192.0 | 8.277960e+00 | 13.874978 | 0.0 | 0.6 | 2.8 | 9.765 | 141.20 |
| ppgin | 8192.0 | 1.238859e+01 | 22.281318 | 0.0 | 0.6 | 3.8 | 13.800 | 292.61 |
| pflt | 8192.0 | 1.097938e+02 | 114.419221 | 0.0 | 25.0 | 63.8 | 159.600 | 899.80 |
| vflt | 8192.0 | 1.853158e+02 | 191.000603 | 0.2 | 45.4 | 120.4 | 251.800 | 1365.00 |
| freemem | 8192.0 | 1.763456e+03 | 2482.104511 | 55.0 | 231.0 | 579.0 | 2002.250 | 12027.00 |
| freeswap | 8192.0 | 1.328126e+06 | 422019.426957 | 2.0 | 1042623.5 | 1289289.5 | 1730379.500 | 2243187.00 |
| usr | 8192.0 | 8.396887e+01 | 18.401905 | 0.0 | 81.0 | 89.0 | 94.000 | 99.00 |
In [10]:
df.duplicated().sum()
Out[10]:
0
In [11]:
df.isnull().sum()
Out[11]:
lread 0 lwrite 0 scall 0 sread 0 swrite 0 fork 0 exec 0 rchar 104 wchar 15 pgout 0 ppgout 0 pgfree 0 pgscan 0 atch 0 pgin 0 ppgin 0 pflt 0 vflt 0 runqsz 0 freemem 0 freeswap 0 usr 0 dtype: int64
In [12]:
for column in df.columns:
counts = df[column].value_counts()
print(f"Value counts for {column}:\n{counts}\n")
Value counts for lread:
lread
1 1050
2 732
0 675
3 539
4 408
...
223 1
254 1
141 1
117 1
129 1
Name: count, Length: 235, dtype: int64
Value counts for lwrite:
lwrite
0 2684
1 1529
2 615
3 284
4 253
...
183 1
138 1
270 1
120 1
267 1
Name: count, Length: 189, dtype: int64
Value counts for scall:
scall
158 10
220 10
419 9
160 9
230 9
..
3362 1
4460 1
4765 1
3868 1
5180 1
Name: count, Length: 4115, dtype: int64
Value counts for sread:
sread
16 43
10 41
43 40
12 38
95 37
..
671 1
867 1
420 1
772 1
674 1
Name: count, Length: 794, dtype: int64
Value counts for swrite:
swrite
30 56
91 56
24 53
118 51
22 50
..
599 1
732 1
419 1
1042 1
612 1
Name: count, Length: 640, dtype: int64
Value counts for fork:
fork
0.20 1999
0.40 966
0.60 716
0.80 563
1.00 398
...
12.38 1
1.78 1
3.56 1
0.59 1
6.37 1
Name: count, Length: 228, dtype: int64
Value counts for exec:
exec
0.20 2060
0.40 595
0.60 571
0.80 453
1.00 344
...
34.33 1
28.34 1
9.15 1
34.67 1
34.47 1
Name: count, Length: 386, dtype: int64
Value counts for rchar:
rchar
452.0 6
6994.0 5
7001.0 5
7018.0 4
425.0 4
..
122096.0 1
23110.0 1
49659.0 1
94575.0 1
111111.0 1
Name: count, Length: 7898, dtype: int64
Value counts for wchar:
wchar
18709.0 4
13554.0 3
21962.0 3
25473.0 3
8482.0 3
..
82665.0 1
158009.0 1
38142.0 1
25607.0 1
22256.0 1
Name: count, Length: 7925, dtype: int64
Value counts for pgout:
pgout
0.00 4878
0.20 140
0.40 140
0.60 135
0.80 126
...
38.00 1
6.61 1
43.60 1
23.20 1
14.74 1
Name: count, Length: 404, dtype: int64
Value counts for ppgout:
ppgout
0.00 4878
0.40 116
0.60 106
0.20 99
0.80 95
...
45.20 1
40.20 1
45.51 1
65.67 1
55.71 1
Name: count, Length: 774, dtype: int64
Value counts for pgfree:
pgfree
0.00 4869
0.40 115
0.20 98
0.60 98
0.80 87
...
73.65 1
131.00 1
96.81 1
62.12 1
13.03 1
Name: count, Length: 1070, dtype: int64
Value counts for pgscan:
pgscan
0.00 6448
0.60 9
1.20 7
2.40 7
27.00 7
...
286.23 1
190.18 1
43.29 1
256.09 1
18.04 1
Name: count, Length: 1202, dtype: int64
Value counts for atch:
atch
0.00 4575
0.20 804
0.40 504
0.60 307
0.80 287
...
98.61 1
41.80 1
18.84 1
2.58 1
8.82 1
Name: count, Length: 253, dtype: int64
Value counts for pgin:
pgin
0.00 1220
0.20 457
0.40 317
0.60 288
0.80 248
...
53.31 1
69.58 1
72.20 1
19.28 1
35.87 1
Name: count, Length: 832, dtype: int64
Value counts for ppgin:
ppgin
0.00 1220
0.20 350
0.40 332
0.80 221
0.60 215
...
79.04 1
71.40 1
16.23 1
84.03 1
47.90 1
Name: count, Length: 1072, dtype: int64
Value counts for pflt:
pflt
15.60 532
15.80 114
15.40 111
16.00 83
15.57 79
...
300.00 1
144.71 1
254.60 1
76.65 1
93.19 1
Name: count, Length: 2987, dtype: int64
Value counts for vflt:
vflt
16.80 412
17.00 95
16.83 67
16.77 53
17.20 50
...
482.57 1
525.00 1
497.01 1
27.05 1
270.74 1
Name: count, Length: 3799, dtype: int64
Value counts for runqsz:
runqsz
Not_CPU_Bound 4331
CPU_Bound 3861
Name: count, dtype: int64
Value counts for freemem:
freemem
132 37
159 31
168 29
136 28
139 28
..
874 1
11640 1
4728 1
6888 1
6210 1
Name: count, Length: 3165, dtype: int64
Value counts for freeswap:
freeswap
11 25
10 23
9 22
12 19
7 19
..
1745791 1
1064513 1
1092566 1
1381216 1
1756514 1
Name: count, Length: 7658, dtype: int64
Value counts for usr:
usr
90 459
91 448
92 426
94 421
93 411
97 410
96 410
95 405
88 384
98 378
89 376
87 338
0 283
86 283
85 254
84 252
83 230
81 201
82 187
80 166
79 150
77 144
78 126
76 119
75 104
74 96
72 77
73 73
99 60
69 51
71 49
68 46
70 42
67 39
66 36
63 32
64 27
62 27
65 25
59 23
60 20
58 17
61 16
57 14
56 11
1 10
55 10
54 7
53 5
50 4
51 4
52 2
49 1
48 1
2 1
46 1
Name: count, dtype: int64
In [13]:
# Select numerical columns
num_columns = df.select_dtypes(include=['int64', 'float64']).columns
# Select categorical columns
cat_columns = df.select_dtypes(include=['object', 'category']).columns
In [867]:
num_columns
Out[867]:
['lread', 'lwrite', 'scall', 'sread', 'swrite', 'fork', 'exec', 'rchar', 'wchar', 'pgout', 'ppgout', 'pgfree', 'pgscan', 'atch', 'pgin', 'ppgin', 'pflt', 'vflt', 'freemem', 'freeswap', 'usr']
In [868]:
cat_columns
Out[868]:
Index(['runqsz'], dtype='object')
Uni-variate Analysis¶
Categorical¶
In [14]:
fig, ax = plt.subplots(figsize=(15, 10))
# Loop through the selected columns and draw countplots on a single set of axes
for column in cat_columns:
sns.countplot(x=column, data=df, ax=ax)
ax.set_title(f'Countplot for {column}')
# Adjust layout to prevent overlapping titles
plt.tight_layout()
# Show the plot
plt.show()
In [15]:
# Loop through the selected columns
for column in cat_columns:
count_info = df[column].value_counts()
print(f'Count information for {column}:\n{count_info}\n{"="*30}\n')
Count information for runqsz: runqsz Not_CPU_Bound 4331 CPU_Bound 3861 Name: count, dtype: int64 ==============================
Numerical¶
In [16]:
# Set up subplots dynamically
fig, axes = plt.subplots(nrows=21, ncols=1, figsize=(15, 100))
# Draw histplot for each numerical column
for i, column in enumerate(num_columns):
sns.histplot(df[column], bins=20, kde=True, ax=axes[i])
axes[i].set_title(f'Histogram for {column}')
# Adjust layout to prevent overlapping titles
plt.tight_layout()
# Show the plot
plt.show()
In [870]:
# Calculate and display histogram values for each numerical column
for column in num_columns:
hist_values, bin_edges = np.histogram(df[column], bins=20)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
print(f'Histogram values for {column}:')
for center, value in zip(bin_centers, hist_values):
print(f'Bin Center: {center}, Count: {value}')
print('\n')
Histogram values for lread: Bin Center: 1.175, Count: 2457 Bin Center: 3.5250000000000004, Count: 947 Bin Center: 5.875, Count: 924 Bin Center: 8.225000000000001, Count: 409 Bin Center: 10.575, Count: 359 Bin Center: 12.925, Count: 471 Bin Center: 15.275, Count: 238 Bin Center: 17.625, Count: 218 Bin Center: 19.975, Count: 277 Bin Center: 22.325000000000003, Count: 146 Bin Center: 24.675, Count: 113 Bin Center: 27.025000000000002, Count: 164 Bin Center: 29.375, Count: 100 Bin Center: 31.725, Count: 86 Bin Center: 34.075, Count: 133 Bin Center: 36.425, Count: 83 Bin Center: 38.775000000000006, Count: 67 Bin Center: 41.125, Count: 97 Bin Center: 43.475, Count: 64 Bin Center: 45.825, Count: 839 Histogram values for lwrite: Bin Center: 0.625, Count: 4213 Bin Center: 1.875, Count: 615 Bin Center: 3.125, Count: 284 Bin Center: 4.375, Count: 253 Bin Center: 5.625, Count: 372 Bin Center: 6.875, Count: 126 Bin Center: 8.125, Count: 113 Bin Center: 9.375, Count: 100 Bin Center: 10.625, Count: 182 Bin Center: 11.875, Count: 75 Bin Center: 13.125, Count: 100 Bin Center: 14.375, Count: 62 Bin Center: 15.625, Count: 94 Bin Center: 16.875, Count: 43 Bin Center: 18.125, Count: 43 Bin Center: 19.375, Count: 33 Bin Center: 20.625, Count: 63 Bin Center: 21.875, Count: 28 Bin Center: 23.125, Count: 29 Bin Center: 24.375, Count: 1364 Histogram values for scall: Bin Center: 275.653125, Count: 1026 Bin Center: 608.9593749999999, Count: 596 Bin Center: 942.2656249999999, Count: 608 Bin Center: 1275.5718749999999, Count: 681 Bin Center: 1608.878125, Count: 643 Bin Center: 1942.1843749999998, Count: 657 Bin Center: 2275.4906249999995, Count: 672 Bin Center: 2608.796875, Count: 535 Bin Center: 2942.1031249999996, Count: 457 Bin Center: 3275.409375, Count: 425 Bin Center: 3608.715625, Count: 383 Bin Center: 3942.0218749999995, Count: 333 Bin Center: 4275.328125, Count: 293 Bin Center: 4608.634375, Count: 223 Bin Center: 4941.940624999999, Count: 153 Bin Center: 5275.246875, Count: 135 Bin Center: 5608.553124999999, Count: 95 Bin Center: 5941.859375, Count: 74 Bin Center: 6275.165625, Count: 59 Bin Center: 6608.471874999999, Count: 144 Histogram values for sread: Bin Center: 20.0625, Count: 705 Bin Center: 48.1875, Count: 725 Bin Center: 76.3125, Count: 724 Bin Center: 104.4375, Count: 749 Bin Center: 132.5625, Count: 713 Bin Center: 160.6875, Count: 704 Bin Center: 188.8125, Count: 613 Bin Center: 216.9375, Count: 523 Bin Center: 245.0625, Count: 431 Bin Center: 273.1875, Count: 375 Bin Center: 301.3125, Count: 306 Bin Center: 329.4375, Count: 251 Bin Center: 357.5625, Count: 231 Bin Center: 385.6875, Count: 172 Bin Center: 413.8125, Count: 163 Bin Center: 441.9375, Count: 133 Bin Center: 470.0625, Count: 125 Bin Center: 498.1875, Count: 92 Bin Center: 526.3125, Count: 61 Bin Center: 554.4375, Count: 396 Histogram values for swrite: Bin Center: 16.025, Count: 607 Bin Center: 34.075, Count: 693 Bin Center: 52.125, Count: 688 Bin Center: 70.17500000000001, Count: 708 Bin Center: 88.225, Count: 680 Bin Center: 106.275, Count: 656 Bin Center: 124.32500000000002, Count: 660 Bin Center: 142.375, Count: 599 Bin Center: 160.425, Count: 499 Bin Center: 178.47500000000002, Count: 400 Bin Center: 196.525, Count: 314 Bin Center: 214.57500000000002, Count: 245 Bin Center: 232.625, Count: 203 Bin Center: 250.675, Count: 184 Bin Center: 268.725, Count: 144 Bin Center: 286.775, Count: 91 Bin Center: 304.82500000000005, Count: 88 Bin Center: 322.875, Count: 91 Bin Center: 340.925, Count: 81 Bin Center: 358.975, Count: 561 Histogram values for fork: Bin Center: 0.12250000000000001, Count: 2021 Bin Center: 0.36750000000000005, Count: 967 Bin Center: 0.6125, Count: 717 Bin Center: 0.8575000000000002, Count: 564 Bin Center: 1.1025, Count: 714 Bin Center: 1.3475000000000001, Count: 271 Bin Center: 1.5925000000000002, Count: 219 Bin Center: 1.8375000000000001, Count: 257 Bin Center: 2.0825, Count: 417 Bin Center: 2.3275, Count: 140 Bin Center: 2.5725000000000002, Count: 148 Bin Center: 2.8175000000000003, Count: 119 Bin Center: 3.0625000000000004, Count: 110 Bin Center: 3.3075, Count: 188 Bin Center: 3.5525, Count: 79 Bin Center: 3.7975000000000003, Count: 71 Bin Center: 4.0425, Count: 60 Bin Center: 4.2875, Count: 92 Bin Center: 4.532500000000001, Count: 54 Bin Center: 4.7775, Count: 984 Histogram values for exec: Bin Center: 0.16749999999999998, Count: 2082 Bin Center: 0.5025, Count: 1169 Bin Center: 0.8374999999999999, Count: 802 Bin Center: 1.1724999999999999, Count: 291 Bin Center: 1.5074999999999998, Count: 472 Bin Center: 1.8424999999999998, Count: 642 Bin Center: 2.1774999999999998, Count: 282 Bin Center: 2.5124999999999997, Count: 368 Bin Center: 2.8474999999999997, Count: 288 Bin Center: 3.1824999999999997, Count: 142 Bin Center: 3.5174999999999996, Count: 216 Bin Center: 3.8524999999999996, Count: 182 Bin Center: 4.1875, Count: 68 Bin Center: 4.522499999999999, Count: 130 Bin Center: 4.8575, Count: 100 Bin Center: 5.192499999999999, Count: 43 Bin Center: 5.5275, Count: 69 Bin Center: 5.862499999999999, Count: 59 Bin Center: 6.1975, Count: 28 Bin Center: 6.532499999999999, Count: 759 Histogram values for rchar: Bin Center: 15550.953125, Count: 1921 Bin Center: 46096.859375, Count: 807 Bin Center: 76642.765625, Count: 609 Bin Center: 107188.671875, Count: 659 Bin Center: 137734.578125, Count: 637 Bin Center: 168280.484375, Count: 477 Bin Center: 198826.390625, Count: 420 Bin Center: 229372.296875, Count: 370 Bin Center: 259918.203125, Count: 358 Bin Center: 290464.109375, Count: 285 Bin Center: 321010.015625, Count: 229 Bin Center: 351555.921875, Count: 208 Bin Center: 382101.828125, Count: 152 Bin Center: 412647.734375, Count: 131 Bin Center: 443193.640625, Count: 99 Bin Center: 473739.546875, Count: 95 Bin Center: 504285.453125, Count: 75 Bin Center: 534831.359375, Count: 82 Bin Center: 565377.265625, Count: 58 Bin Center: 595923.171875, Count: 520 Histogram values for wchar: Bin Center: 7226.196875, Count: 951 Bin Center: 18682.590624999997, Count: 1264 Bin Center: 30138.984374999996, Count: 1163 Bin Center: 41595.378124999996, Count: 779 Bin Center: 53051.771875, Count: 638 Bin Center: 64508.165624999994, Count: 456 Bin Center: 75964.55937499998, Count: 334 Bin Center: 87420.953125, Count: 293 Bin Center: 98877.34687499999, Count: 233 Bin Center: 110333.740625, Count: 217 Bin Center: 121790.134375, Count: 165 Bin Center: 133246.52812499998, Count: 149 Bin Center: 144702.921875, Count: 131 Bin Center: 156159.315625, Count: 108 Bin Center: 167615.70937499998, Count: 84 Bin Center: 179072.103125, Count: 93 Bin Center: 190528.49687499998, Count: 89 Bin Center: 201984.890625, Count: 79 Bin Center: 213441.284375, Count: 74 Bin Center: 224897.67812499998, Count: 892 Histogram values for pgout: Bin Center: 0.15, Count: 5018 Bin Center: 0.44999999999999996, Count: 141 Bin Center: 0.75, Count: 262 Bin Center: 1.0499999999999998, Count: 113 Bin Center: 1.35, Count: 202 Bin Center: 1.65, Count: 109 Bin Center: 1.95, Count: 176 Bin Center: 2.25, Count: 74 Bin Center: 2.55, Count: 168 Bin Center: 2.8499999999999996, Count: 90 Bin Center: 3.15, Count: 119 Bin Center: 3.4499999999999997, Count: 66 Bin Center: 3.75, Count: 109 Bin Center: 4.05, Count: 99 Bin Center: 4.35, Count: 97 Bin Center: 4.65, Count: 49 Bin Center: 4.949999999999999, Count: 86 Bin Center: 5.25, Count: 74 Bin Center: 5.55, Count: 73 Bin Center: 5.85, Count: 1067 Histogram values for ppgout: Bin Center: 0.2625, Count: 5093 Bin Center: 0.7875000000000001, Count: 283 Bin Center: 1.3125, Count: 163 Bin Center: 1.8375000000000001, Count: 191 Bin Center: 2.3625, Count: 126 Bin Center: 2.8875, Count: 73 Bin Center: 3.4125000000000005, Count: 115 Bin Center: 3.9375, Count: 78 Bin Center: 4.4625, Count: 111 Bin Center: 4.987500000000001, Count: 97 Bin Center: 5.5125, Count: 61 Bin Center: 6.0375000000000005, Count: 80 Bin Center: 6.5625, Count: 63 Bin Center: 7.0875, Count: 41 Bin Center: 7.612500000000001, Count: 77 Bin Center: 8.1375, Count: 52 Bin Center: 8.662500000000001, Count: 48 Bin Center: 9.1875, Count: 44 Bin Center: 9.7125, Count: 35 Bin Center: 10.2375, Count: 1361 Histogram values for pgfree: Bin Center: 0.3125, Count: 5181 Bin Center: 0.9375, Count: 237 Bin Center: 1.5625, Count: 191 Bin Center: 2.1875, Count: 134 Bin Center: 2.8125, Count: 109 Bin Center: 3.4375, Count: 93 Bin Center: 4.0625, Count: 98 Bin Center: 4.6875, Count: 89 Bin Center: 5.3125, Count: 82 Bin Center: 5.9375, Count: 61 Bin Center: 6.5625, Count: 42 Bin Center: 7.1875, Count: 37 Bin Center: 7.8125, Count: 46 Bin Center: 8.4375, Count: 39 Bin Center: 9.0625, Count: 37 Bin Center: 9.6875, Count: 35 Bin Center: 10.3125, Count: 50 Bin Center: 10.9375, Count: 28 Bin Center: 11.5625, Count: 26 Bin Center: 12.1875, Count: 1577 Histogram values for pgscan: Bin Center: -0.475, Count: 0 Bin Center: -0.42500000000000004, Count: 0 Bin Center: -0.375, Count: 0 Bin Center: -0.32499999999999996, Count: 0 Bin Center: -0.275, Count: 0 Bin Center: -0.22499999999999998, Count: 0 Bin Center: -0.17499999999999996, Count: 0 Bin Center: -0.12499999999999997, Count: 0 Bin Center: -0.07499999999999998, Count: 0 Bin Center: -0.024999999999999994, Count: 0 Bin Center: 0.025000000000000022, Count: 8192 Bin Center: 0.07500000000000007, Count: 0 Bin Center: 0.12500000000000006, Count: 0 Bin Center: 0.17500000000000004, Count: 0 Bin Center: 0.22500000000000003, Count: 0 Bin Center: 0.275, Count: 0 Bin Center: 0.32500000000000007, Count: 0 Bin Center: 0.37500000000000006, Count: 0 Bin Center: 0.42500000000000004, Count: 0 Bin Center: 0.47500000000000003, Count: 0 Histogram values for atch: Bin Center: 0.0375, Count: 4575 Bin Center: 0.11249999999999999, Count: 0 Bin Center: 0.1875, Count: 804 Bin Center: 0.26249999999999996, Count: 0 Bin Center: 0.3375, Count: 0 Bin Center: 0.4125, Count: 504 Bin Center: 0.4875, Count: 0 Bin Center: 0.5625, Count: 0 Bin Center: 0.6375, Count: 307 Bin Center: 0.7124999999999999, Count: 0 Bin Center: 0.7875, Count: 289 Bin Center: 0.8624999999999999, Count: 0 Bin Center: 0.9375, Count: 0 Bin Center: 1.0125, Count: 203 Bin Center: 1.0875, Count: 0 Bin Center: 1.1625, Count: 3 Bin Center: 1.2374999999999998, Count: 171 Bin Center: 1.3125, Count: 0 Bin Center: 1.3875, Count: 127 Bin Center: 1.4625, Count: 1209 Histogram values for pgin: Bin Center: 0.5878125000000001, Count: 2766 Bin Center: 1.7634375000000002, Count: 1014 Bin Center: 2.9390625000000004, Count: 668 Bin Center: 4.1146875000000005, Count: 480 Bin Center: 5.290312500000001, Count: 343 Bin Center: 6.465937500000001, Count: 312 Bin Center: 7.641562500000001, Count: 302 Bin Center: 8.817187500000001, Count: 224 Bin Center: 9.992812500000001, Count: 168 Bin Center: 11.168437500000001, Count: 181 Bin Center: 12.344062500000001, Count: 164 Bin Center: 13.519687500000002, Count: 128 Bin Center: 14.695312500000002, Count: 125 Bin Center: 15.870937500000002, Count: 103 Bin Center: 17.0465625, Count: 87 Bin Center: 18.222187500000004, Count: 76 Bin Center: 19.3978125, Count: 68 Bin Center: 20.573437500000004, Count: 80 Bin Center: 21.7490625, Count: 55 Bin Center: 22.924687500000005, Count: 848 Histogram values for ppgin: Bin Center: 0.8400000000000001, Count: 3029 Bin Center: 2.5200000000000005, Count: 860 Bin Center: 4.200000000000001, Count: 627 Bin Center: 5.880000000000001, Count: 433 Bin Center: 7.5600000000000005, Count: 327 Bin Center: 9.240000000000002, Count: 330 Bin Center: 10.920000000000002, Count: 245 Bin Center: 12.600000000000001, Count: 249 Bin Center: 14.280000000000001, Count: 174 Bin Center: 15.96, Count: 151 Bin Center: 17.64, Count: 157 Bin Center: 19.32, Count: 110 Bin Center: 21.000000000000004, Count: 118 Bin Center: 22.680000000000003, Count: 105 Bin Center: 24.360000000000003, Count: 108 Bin Center: 26.040000000000003, Count: 84 Bin Center: 27.720000000000002, Count: 74 Bin Center: 29.400000000000002, Count: 68 Bin Center: 31.080000000000002, Count: 61 Bin Center: 32.760000000000005, Count: 882 Histogram values for pflt: Bin Center: 9.0375, Count: 1557 Bin Center: 27.112499999999997, Count: 1224 Bin Center: 45.1875, Count: 942 Bin Center: 63.262499999999996, Count: 665 Bin Center: 81.3375, Count: 496 Bin Center: 99.4125, Count: 426 Bin Center: 117.48749999999998, Count: 315 Bin Center: 135.5625, Count: 306 Bin Center: 153.6375, Count: 281 Bin Center: 171.71249999999998, Count: 249 Bin Center: 189.7875, Count: 224 Bin Center: 207.86249999999998, Count: 208 Bin Center: 225.9375, Count: 186 Bin Center: 244.0125, Count: 179 Bin Center: 262.0875, Count: 121 Bin Center: 280.1625, Count: 92 Bin Center: 298.23749999999995, Count: 94 Bin Center: 316.3125, Count: 80 Bin Center: 334.3875, Count: 74 Bin Center: 352.4625, Count: 473 Histogram values for vflt: Bin Center: 14.23, Count: 1422 Bin Center: 42.290000000000006, Count: 986 Bin Center: 70.35000000000001, Count: 844 Bin Center: 98.41000000000001, Count: 678 Bin Center: 126.47, Count: 574 Bin Center: 154.53, Count: 524 Bin Center: 182.59, Count: 448 Bin Center: 210.65, Count: 368 Bin Center: 238.71, Count: 310 Bin Center: 266.77, Count: 228 Bin Center: 294.83000000000004, Count: 216 Bin Center: 322.89, Count: 173 Bin Center: 350.95000000000005, Count: 151 Bin Center: 379.01, Count: 165 Bin Center: 407.07000000000005, Count: 124 Bin Center: 435.13, Count: 131 Bin Center: 463.19000000000005, Count: 109 Bin Center: 491.25, Count: 102 Bin Center: 519.3100000000001, Count: 86 Bin Center: 547.3700000000001, Count: 553 Histogram values for freemem: Bin Center: 170.103125, Count: 2557 Bin Center: 400.30937500000005, Count: 1283 Bin Center: 630.515625, Count: 850 Bin Center: 860.7218750000001, Count: 436 Bin Center: 1090.928125, Count: 368 Bin Center: 1321.134375, Count: 260 Bin Center: 1551.3406250000003, Count: 169 Bin Center: 1781.546875, Count: 151 Bin Center: 2011.7531250000002, Count: 161 Bin Center: 2241.9593750000004, Count: 77 Bin Center: 2472.165625, Count: 134 Bin Center: 2702.3718750000003, Count: 108 Bin Center: 2932.578125, Count: 114 Bin Center: 3162.784375, Count: 67 Bin Center: 3392.9906250000004, Count: 53 Bin Center: 3623.196875, Count: 64 Bin Center: 3853.4031250000003, Count: 43 Bin Center: 4083.6093750000005, Count: 56 Bin Center: 4313.815625, Count: 14 Bin Center: 4544.021875, Count: 1227 Histogram values for freeswap: Bin Center: 66794.4375, Count: 294 Bin Center: 178404.3125, Count: 0 Bin Center: 290014.1875, Count: 0 Bin Center: 401624.0625, Count: 0 Bin Center: 513233.9375, Count: 0 Bin Center: 624843.8125, Count: 0 Bin Center: 736453.6875, Count: 0 Bin Center: 848063.5625, Count: 0 Bin Center: 959673.4375, Count: 1159 Bin Center: 1071283.3125, Count: 2521 Bin Center: 1182893.1875, Count: 117 Bin Center: 1294503.0625, Count: 285 Bin Center: 1406112.9375, Count: 305 Bin Center: 1517722.8125, Count: 538 Bin Center: 1629332.6875, Count: 325 Bin Center: 1740942.5625, Count: 1436 Bin Center: 1852552.4375, Count: 1208 Bin Center: 1964162.3125, Count: 1 Bin Center: 2075772.1875, Count: 1 Bin Center: 2187382.0625, Count: 2 Histogram values for usr: Bin Center: 62.4375, Count: 489 Bin Center: 64.3125, Count: 52 Bin Center: 66.1875, Count: 75 Bin Center: 68.0625, Count: 46 Bin Center: 69.9375, Count: 93 Bin Center: 71.8125, Count: 126 Bin Center: 73.6875, Count: 169 Bin Center: 75.5625, Count: 223 Bin Center: 77.4375, Count: 270 Bin Center: 79.3125, Count: 316 Bin Center: 81.1875, Count: 388 Bin Center: 83.0625, Count: 230 Bin Center: 84.9375, Count: 506 Bin Center: 86.8125, Count: 621 Bin Center: 88.6875, Count: 760 Bin Center: 90.5625, Count: 907 Bin Center: 92.4375, Count: 837 Bin Center: 94.3125, Count: 826 Bin Center: 96.1875, Count: 820 Bin Center: 98.0625, Count: 438
In [17]:
# Set up subplot for boxplot
fig, axes = plt.subplots(figsize=(20,16))
# Draw boxplot for all numerical columns
sns.boxplot(data=df[num_columns], ax=axes)
axes.set_title('Boxplot for Numerical Columns')
# Show the boxplot
plt.show()
In [ ]:
In [18]:
# Create a boxplot using Plotly Express
fig = px.box(df, y=num_columns, title='Boxplot for Numerical Columns')
fig.update_layout(height=600, width=1000)
fig.update_xaxes(tickangle=90)
# Show the plot
fig.show()
Bivariate Analysis¶
Numeric Vs Numeric¶
In [ ]:
# Pairplot
sns.pairplot(df[num_columns])
plt.suptitle("Pairplot of Numerical Variables", y=1.02)
sns.set(rc={'figure.figsize':(15.7,20.27)})
plt.savefig("pair_NvN.png", format="png", bbox_inches='tight')
plt.show()
# Correlation Matrix
correlation_matrix = df[num_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
sns.set(rc={'figure.figsize':(15.7,20.27)})
#plt.savefig("corr_NvN.png", format="png", bbox_inches='tight')
plt.show()
# Scatterplots
for i in range(len(num_columns)):
for j in range(i+1, len(num_columns)):
plt.scatter(df[num_columns[i]], df[num_columns[j]])
plt.xlabel(num_columns[i])
plt.ylabel(num_columns[j])
plt.title(f'Scatterplot: {num_columns[i]} vs {num_columns[j]}')
#plt.savefig(f"scatter_NvN_{num_columns[i]}_vs_{num_columns[j]}.png", format="png", bbox_inches='tight')
plt.show()
# Jointplots
#for i in range(len(num_columns)):
# for j in range(i+1, len(num_columns)):
# sns.jointplot(x=num_columns[i], y=num_columns[j], data=df, kind='scatter')
# fig.suptitle(f'Jointplot: {num_columns[i]} vs {num_columns[j]}', y=1.02)
#plt.savefig(f"joint_NvN_{num_columns[i]}_vs_{num_columns[j]}.png", format="png", bbox_inches='tight')
# plt.show()
In [ ]:
# Output for the above cell has been redacted due large file size
In [20]:
# Convert correlation matrix to a long-form DataFrame for sorting
correlation_df = correlation_matrix.unstack().reset_index(name='Correlation')
sorted_correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)
# Print the sorted correlation values
print("Sorted Correlation Matrix (Descending Order):")
for idx, row in sorted_correlation_df.iterrows():
print(f"{row['level_0']} vs {row['level_1']}: {row['Correlation']:.2f}")
Sorted Correlation Matrix (Descending Order): lread vs lread: 1.00 pgfree vs pgfree: 1.00 sread vs sread: 1.00 swrite vs swrite: 1.00 fork vs fork: 1.00 exec vs exec: 1.00 rchar vs rchar: 1.00 wchar vs wchar: 1.00 pgout vs pgout: 1.00 pgscan vs pgscan: 1.00 lwrite vs lwrite: 1.00 atch vs atch: 1.00 pgin vs pgin: 1.00 ppgin vs ppgin: 1.00 pflt vs pflt: 1.00 vflt vs vflt: 1.00 freemem vs freemem: 1.00 freeswap vs freeswap: 1.00 scall vs scall: 1.00 ppgout vs ppgout: 1.00 usr vs usr: 1.00 fork vs vflt: 0.94 vflt vs fork: 0.94 pflt vs vflt: 0.94 vflt vs pflt: 0.94 pflt vs fork: 0.93 fork vs pflt: 0.93 ppgin vs pgin: 0.92 pgin vs ppgin: 0.92 pgfree vs ppgout: 0.92 ppgout vs pgfree: 0.92 pgscan vs pgfree: 0.92 pgfree vs pgscan: 0.92 swrite vs sread: 0.88 sread vs swrite: 0.88 ppgout vs pgout: 0.87 pgout vs ppgout: 0.87 ppgout vs pgscan: 0.79 pgscan vs ppgout: 0.79 fork vs exec: 0.76 exec vs fork: 0.76 pgout vs pgfree: 0.73 pgfree vs pgout: 0.73 sread vs scall: 0.70 scall vs sread: 0.70 vflt vs exec: 0.69 exec vs vflt: 0.69 freeswap vs usr: 0.68 usr vs freeswap: 0.68 pflt vs exec: 0.65 exec vs pflt: 0.65 swrite vs scall: 0.62 scall vs swrite: 0.62 pgfree vs ppgin: 0.59 ppgin vs pgfree: 0.59 freemem vs freeswap: 0.57 freeswap vs freemem: 0.57 pgscan vs ppgin: 0.56 ppgin vs pgscan: 0.56 pgscan vs pgout: 0.55 pgout vs pgscan: 0.55 ppgout vs ppgin: 0.54 ppgin vs ppgout: 0.54 lread vs lwrite: 0.53 lwrite vs lread: 0.53 pgin vs pgfree: 0.53 pgfree vs pgin: 0.53 vflt vs scall: 0.53 scall vs vflt: 0.53 rchar vs wchar: 0.50 wchar vs rchar: 0.50 sread vs rchar: 0.50 rchar vs sread: 0.50 pgscan vs pgin: 0.50 pgin vs pgscan: 0.50 vflt vs sread: 0.49 sread vs vflt: 0.49 ppgout vs pgin: 0.49 pgin vs ppgout: 0.49 pflt vs scall: 0.48 scall vs pflt: 0.48 sread vs pflt: 0.45 pflt vs sread: 0.45 fork vs scall: 0.45 scall vs fork: 0.45 fork vs sread: 0.42 sread vs fork: 0.42 vflt vs swrite: 0.42 swrite vs vflt: 0.42 ppgin vs pgout: 0.41 pgout vs ppgin: 0.41 wchar vs sread: 0.40 sread vs wchar: 0.40 swrite vs pflt: 0.40 pflt vs swrite: 0.40 wchar vs swrite: 0.39 swrite vs wchar: 0.39 pgin vs pgout: 0.39 pgout vs pgin: 0.39 swrite vs fork: 0.38 fork vs swrite: 0.38 rchar vs vflt: 0.36 vflt vs rchar: 0.36 rchar vs scall: 0.35 scall vs rchar: 0.35 rchar vs ppgin: 0.35 ppgin vs rchar: 0.35 swrite vs rchar: 0.33 rchar vs swrite: 0.33 pflt vs rchar: 0.31 rchar vs pflt: 0.31 scall vs exec: 0.31 exec vs scall: 0.31 vflt vs pgin: 0.30 pgin vs vflt: 0.30 pgfree vs vflt: 0.30 vflt vs pgfree: 0.30 rchar vs pgin: 0.30 pgin vs rchar: 0.30 vflt vs ppgout: 0.29 ppgout vs vflt: 0.29 pgscan vs vflt: 0.28 vflt vs pgscan: 0.28 fork vs rchar: 0.28 rchar vs fork: 0.28 pgfree vs rchar: 0.28 rchar vs pgfree: 0.28 wchar vs scall: 0.27 scall vs wchar: 0.27 usr vs freemem: 0.27 freemem vs usr: 0.27 rchar vs ppgout: 0.27 ppgout vs rchar: 0.27 vflt vs ppgin: 0.26 ppgin vs vflt: 0.26 pgscan vs rchar: 0.26 rchar vs pgscan: 0.26 scall vs pgin: 0.24 pgin vs scall: 0.24 vflt vs pgout: 0.23 pgout vs vflt: 0.23 sread vs ppgout: 0.23 ppgout vs sread: 0.23 ppgin vs scall: 0.22 scall vs ppgin: 0.22 sread vs pgfree: 0.21 pgfree vs sread: 0.21 rchar vs pgout: 0.21 pgout vs rchar: 0.21 sread vs ppgin: 0.21 ppgin vs sread: 0.21 scall vs ppgout: 0.21 ppgout vs scall: 0.21 sread vs pgin: 0.21 pgin vs sread: 0.21 wchar vs ppgin: 0.20 ppgin vs wchar: 0.20 scall vs pgfree: 0.20 pgfree vs scall: 0.20 scall vs pgout: 0.19 pgout vs scall: 0.19 pgscan vs sread: 0.19 sread vs pgscan: 0.19 sread vs pgout: 0.19 pgout vs sread: 0.19 wchar vs pgout: 0.19 pgout vs wchar: 0.19 scall vs lread: 0.19 lread vs scall: 0.19 pgfree vs pflt: 0.19 pflt vs pgfree: 0.19 lread vs pgin: 0.19 pgin vs lread: 0.19 wchar vs ppgout: 0.19 ppgout vs wchar: 0.19 exec vs pgin: 0.19 pgin vs exec: 0.19 ppgout vs pflt: 0.19 pflt vs ppgout: 0.19 wchar vs atch: 0.18 atch vs wchar: 0.18 pgscan vs pflt: 0.18 pflt vs pgscan: 0.18 pgin vs wchar: 0.18 wchar vs pgin: 0.18 scall vs pgscan: 0.18 pgscan vs scall: 0.18 pgin vs pflt: 0.18 pflt vs pgin: 0.18 atch vs rchar: 0.17 rchar vs atch: 0.17 exec vs rchar: 0.17 rchar vs exec: 0.17 fork vs pgfree: 0.17 pgfree vs fork: 0.17 ppgout vs fork: 0.17 fork vs ppgout: 0.17 vflt vs lread: 0.17 lread vs vflt: 0.17 exec vs sread: 0.16 sread vs exec: 0.16 fork vs pgin: 0.16 pgin vs fork: 0.16 lread vs ppgin: 0.16 ppgin vs lread: 0.16 fork vs pgscan: 0.16 pgscan vs fork: 0.16 swrite vs ppgout: 0.16 ppgout vs swrite: 0.16 wchar vs pgfree: 0.16 pgfree vs wchar: 0.16 pgout vs swrite: 0.15 swrite vs pgout: 0.15 pgout vs pflt: 0.15 pflt vs pgout: 0.15 ppgin vs pflt: 0.15 pflt vs ppgin: 0.15 ppgin vs exec: 0.15 exec vs ppgin: 0.15 exec vs ppgout: 0.15 ppgout vs exec: 0.15 pgout vs atch: 0.15 atch vs pgout: 0.15 swrite vs pgin: 0.15 pgin vs swrite: 0.15 exec vs pgfree: 0.15 pgfree vs exec: 0.15 pgfree vs swrite: 0.15 swrite vs pgfree: 0.15 exec vs pgscan: 0.14 pgscan vs exec: 0.14 swrite vs ppgin: 0.14 ppgin vs swrite: 0.14 lwrite vs scall: 0.14 scall vs lwrite: 0.14 fork vs lread: 0.14 lread vs fork: 0.14 lread vs pflt: 0.14 pflt vs lread: 0.14 sread vs lread: 0.13 lread vs sread: 0.13 ppgin vs fork: 0.13 fork vs ppgin: 0.13 lread vs ppgout: 0.13 ppgout vs lread: 0.13 fork vs pgout: 0.13 pgout vs fork: 0.13 sread vs lwrite: 0.13 lwrite vs sread: 0.13 swrite vs pgscan: 0.12 pgscan vs swrite: 0.12 swrite vs lread: 0.12 lread vs swrite: 0.12 rchar vs lwrite: 0.12 lwrite vs rchar: 0.12 lread vs pgfree: 0.11 pgfree vs lread: 0.11 wchar vs pgscan: 0.11 pgscan vs wchar: 0.11 exec vs pgout: 0.11 pgout vs exec: 0.11 vflt vs wchar: 0.11 wchar vs vflt: 0.11 exec vs lread: 0.11 lread vs exec: 0.11 lread vs rchar: 0.11 rchar vs lread: 0.11 swrite vs exec: 0.10 exec vs swrite: 0.10 swrite vs lwrite: 0.10 lwrite vs swrite: 0.10 atch vs vflt: 0.10 vflt vs atch: 0.10 vflt vs lwrite: 0.09 lwrite vs vflt: 0.09 ppgout vs atch: 0.09 atch vs ppgout: 0.09 lwrite vs wchar: 0.09 wchar vs lwrite: 0.09 lwrite vs pgin: 0.09 pgin vs lwrite: 0.09 lwrite vs ppgin: 0.09 ppgin vs lwrite: 0.09 lread vs pgscan: 0.09 pgscan vs lread: 0.09 wchar vs pflt: 0.09 pflt vs wchar: 0.09 atch vs sread: 0.09 sread vs atch: 0.09 pgout vs lread: 0.08 lread vs pgout: 0.08 wchar vs lread: 0.08 lread vs wchar: 0.08 ppgout vs lwrite: 0.08 lwrite vs ppgout: 0.08 scall vs atch: 0.08 atch vs scall: 0.08 pgfree vs atch: 0.07 atch vs pgfree: 0.07 lwrite vs pflt: 0.07 pflt vs lwrite: 0.07 lwrite vs pgout: 0.07 pgout vs lwrite: 0.07 lwrite vs pgfree: 0.07 pgfree vs lwrite: 0.07 atch vs swrite: 0.06 swrite vs atch: 0.06 fork vs wchar: 0.06 wchar vs fork: 0.06 pgin vs atch: 0.06 atch vs pgin: 0.06 atch vs ppgin: 0.06 ppgin vs atch: 0.06 fork vs lwrite: 0.05 lwrite vs fork: 0.05 atch vs exec: 0.05 exec vs atch: 0.05 pflt vs atch: 0.05 atch vs pflt: 0.05 atch vs fork: 0.05 fork vs atch: 0.05 pgscan vs lwrite: 0.04 lwrite vs pgscan: 0.04 pgscan vs atch: 0.04 atch vs pgscan: 0.04 exec vs lwrite: 0.04 lwrite vs exec: 0.04 atch vs lwrite: 0.03 lwrite vs atch: 0.03 atch vs lread: 0.02 lread vs atch: 0.02 wchar vs exec: 0.00 exec vs wchar: 0.00 lread vs freeswap: -0.08 freeswap vs lread: -0.08 freemem vs lread: -0.08 lread vs freemem: -0.08 atch vs freemem: -0.09 freemem vs atch: -0.09 freemem vs lwrite: -0.09 lwrite vs freemem: -0.09 usr vs lwrite: -0.11 lwrite vs usr: -0.11 pflt vs freemem: -0.11 freemem vs pflt: -0.11 freeswap vs lwrite: -0.12 lwrite vs freeswap: -0.12 freeswap vs atch: -0.12 atch vs freeswap: -0.12 fork vs freemem: -0.12 freemem vs fork: -0.12 usr vs atch: -0.13 atch vs usr: -0.13 freeswap vs fork: -0.13 fork vs freeswap: -0.13 pflt vs freeswap: -0.13 freeswap vs pflt: -0.13 usr vs lread: -0.14 lread vs usr: -0.14 freemem vs wchar: -0.15 wchar vs freemem: -0.15 freemem vs rchar: -0.15 rchar vs freemem: -0.15 freeswap vs exec: -0.15 exec vs freeswap: -0.15 freemem vs exec: -0.16 exec vs freemem: -0.16 pgscan vs freeswap: -0.18 freeswap vs pgscan: -0.18 pgscan vs usr: -0.18 usr vs pgscan: -0.18 freemem vs pgscan: -0.19 pgscan vs freemem: -0.19 freemem vs vflt: -0.20 vflt vs freemem: -0.20 freeswap vs pgfree: -0.21 pgfree vs freeswap: -0.21 usr vs ppgout: -0.21 ppgout vs usr: -0.21 ppgout vs freeswap: -0.21 freeswap vs ppgout: -0.21 ppgin vs freemem: -0.22 freemem vs ppgin: -0.22 usr vs pgfree: -0.22 pgfree vs usr: -0.22 rchar vs freeswap: -0.22 freeswap vs rchar: -0.22 usr vs pgout: -0.22 pgout vs usr: -0.22 wchar vs freeswap: -0.23 freeswap vs wchar: -0.23 freemem vs pgin: -0.23 pgin vs freemem: -0.23 usr vs ppgin: -0.23 ppgin vs usr: -0.23 freemem vs pgfree: -0.23 pgfree vs freemem: -0.23 freeswap vs swrite: -0.24 swrite vs freeswap: -0.24 usr vs pgin: -0.24 pgin vs usr: -0.24 freeswap vs pgout: -0.25 pgout vs freeswap: -0.25 vflt vs freeswap: -0.25 freeswap vs vflt: -0.25 freemem vs ppgout: -0.25 ppgout vs freemem: -0.25 swrite vs freemem: -0.25 freemem vs swrite: -0.25 freeswap vs ppgin: -0.25 ppgin vs freeswap: -0.25 freemem vs pgout: -0.27 pgout vs freemem: -0.27 usr vs swrite: -0.27 swrite vs usr: -0.27 pgin vs freeswap: -0.28 freeswap vs pgin: -0.28 freemem vs sread: -0.29 sread vs freemem: -0.29 usr vs exec: -0.29 exec vs usr: -0.29 wchar vs usr: -0.29 usr vs wchar: -0.29 sread vs freeswap: -0.30 freeswap vs sread: -0.30 usr vs scall: -0.32 scall vs usr: -0.32 usr vs rchar: -0.33 rchar vs usr: -0.33 usr vs sread: -0.33 sread vs usr: -0.33 freeswap vs scall: -0.35 scall vs freeswap: -0.35 usr vs fork: -0.36 fork vs usr: -0.36 usr vs pflt: -0.37 pflt vs usr: -0.37 freemem vs scall: -0.39 scall vs freemem: -0.39 usr vs vflt: -0.42 vflt vs usr: -0.42
In [874]:
# Convert correlation matrix to a long-form DataFrame for sorting
mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
correlation_df = correlation_matrix.where(mask)
correlation_df = correlation_df.unstack().reset_index(name='Correlation').dropna()
# Sort the correlation DataFrame
sorted_correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)
# Print the sorted correlation values
print("Sorted Correlation Matrix (Descending Order):")
for idx, row in sorted_correlation_df.iterrows():
print(f"{row['level_0']} vs {row['level_1']}: {row['Correlation']:.2f}")
Sorted Correlation Matrix (Descending Order): pgfree vs ppgout: 0.97 ppgin vs pgin: 0.96 ppgout vs pgout: 0.95 pflt vs fork: 0.94 vflt vs fork: 0.93 vflt vs pflt: 0.93 pgfree vs pgout: 0.91 swrite vs sread: 0.88 lwrite vs lread: 0.83 exec vs fork: 0.77 vflt vs exec: 0.76 sread vs scall: 0.76 pflt vs exec: 0.76 swrite vs scall: 0.74 atch vs pgout: 0.64 atch vs ppgout: 0.61 freeswap vs freemem: 0.61 atch vs pgfree: 0.60 vflt vs sread: 0.60 rchar vs sread: 0.58 usr vs freeswap: 0.56 vflt vs swrite: 0.56 vflt vs scall: 0.55 pflt vs sread: 0.53 fork vs sread: 0.53 fork vs swrite: 0.52 pflt vs swrite: 0.51 wchar vs rchar: 0.49 pflt vs scall: 0.49 ppgin vs pgfree: 0.48 ppgin vs ppgout: 0.48 fork vs scall: 0.47 pgin vs pgfree: 0.46 pgin vs ppgout: 0.46 ppgin vs pgout: 0.45 exec vs scall: 0.44 vflt vs rchar: 0.44 pgin vs pgout: 0.44 wchar vs swrite: 0.43 vflt vs lread: 0.42 rchar vs swrite: 0.42 wchar vs sread: 0.42 vflt vs pgin: 0.40 ppgin vs rchar: 0.39 rchar vs scall: 0.39 vflt vs ppgin: 0.38 pflt vs rchar: 0.38 usr vs freemem: 0.38 pflt vs lread: 0.38 rchar vs fork: 0.37 exec vs sread: 0.37 pgin vs rchar: 0.37 fork vs lread: 0.37 exec vs lread: 0.36 pgin vs sread: 0.35 ppgin vs sread: 0.34 pgin vs scall: 0.34 ppgin vs atch: 0.33 scall vs lread: 0.33 wchar vs scall: 0.33 pgin vs atch: 0.33 sread vs lread: 0.33 ppgin vs scall: 0.33 rchar vs exec: 0.32 vflt vs pgfree: 0.32 vflt vs ppgout: 0.32 ppgout vs sread: 0.32 exec vs swrite: 0.31 pgfree vs sread: 0.31 pgin vs swrite: 0.31 swrite vs lread: 0.31 atch vs scall: 0.31 ppgout vs scall: 0.31 ppgin vs swrite: 0.30 pgout vs sread: 0.30 pgfree vs scall: 0.30 vflt vs pgout: 0.30 pgin vs exec: 0.30 pgout vs scall: 0.30 vflt vs atch: 0.30 atch vs sread: 0.29 ppgin vs lread: 0.29 ppgin vs exec: 0.29 ppgout vs swrite: 0.28 pgin vs lread: 0.28 pgfree vs swrite: 0.28 pgout vs swrite: 0.27 ppgout vs rchar: 0.27 atch vs rchar: 0.27 atch vs swrite: 0.26 pgfree vs rchar: 0.26 ppgin vs wchar: 0.26 rchar vs lread: 0.26 atch vs exec: 0.25 pflt vs pgin: 0.25 pgout vs rchar: 0.25 pgfree vs exec: 0.25 pgin vs wchar: 0.25 ppgout vs exec: 0.25 pgin vs fork: 0.25 pflt vs ppgin: 0.24 ppgin vs fork: 0.24 pgout vs exec: 0.23 atch vs lread: 0.23 pflt vs ppgout: 0.22 pflt vs pgfree: 0.22 ppgout vs lread: 0.22 pgfree vs fork: 0.22 pgfree vs lread: 0.21 ppgout vs fork: 0.21 pflt vs atch: 0.21 pgout vs lread: 0.21 pflt vs pgout: 0.21 ppgout vs wchar: 0.20 atch vs fork: 0.20 pgout vs fork: 0.20 pgout vs wchar: 0.20 pgfree vs wchar: 0.19 wchar vs lread: 0.17 atch vs wchar: 0.16 vflt vs wchar: 0.16 sread vs lwrite: 0.15 scall vs lwrite: 0.14 vflt vs lwrite: 0.14 wchar vs lwrite: 0.13 swrite vs lwrite: 0.13 atch vs lwrite: 0.13 pflt vs wchar: 0.13 wchar vs exec: 0.12 wchar vs fork: 0.12 exec vs lwrite: 0.12 ppgin vs lwrite: 0.12 rchar vs lwrite: 0.12 pgin vs lwrite: 0.11 pflt vs lwrite: 0.10 fork vs lwrite: 0.09 pgout vs lwrite: 0.09 ppgout vs lwrite: 0.09 pgfree vs lwrite: 0.08 freemem vs lwrite: -0.10 freeswap vs pflt: -0.13 freemem vs pflt: -0.13 freeswap vs fork: -0.13 freemem vs fork: -0.14 freemem vs wchar: -0.15 freeswap vs lwrite: -0.15 freemem vs rchar: -0.17 freeswap vs wchar: -0.18 freeswap vs exec: -0.18 usr vs lwrite: -0.19 freemem vs exec: -0.19 freemem vs lread: -0.20 freemem vs vflt: -0.23 freeswap vs rchar: -0.23 freeswap vs lread: -0.24 freeswap vs vflt: -0.26 freemem vs ppgin: -0.30 freemem vs pgin: -0.31 usr vs wchar: -0.32 freeswap vs swrite: -0.34 freeswap vs ppgout: -0.34 freeswap vs pgfree: -0.34 usr vs atch: -0.34 freeswap vs atch: -0.35 freeswap vs pgout: -0.35 freeswap vs ppgin: -0.35 freemem vs sread: -0.35 freemem vs swrite: -0.35 freeswap vs scall: -0.36 freeswap vs pgin: -0.37 freeswap vs sread: -0.37 usr vs pgout: -0.38 usr vs pgfree: -0.38 usr vs ppgout: -0.39 freemem vs scall: -0.39 usr vs lread: -0.44 freemem vs atch: -0.44 usr vs ppgin: -0.45 usr vs pgin: -0.46 freemem vs ppgout: -0.46 freemem vs pgfree: -0.46 freemem vs pgout: -0.47 usr vs rchar: -0.51 usr vs swrite: -0.60 usr vs exec: -0.61 usr vs scall: -0.62 usr vs sread: -0.64 usr vs fork: -0.67 usr vs pflt: -0.70 usr vs vflt: -0.75
Categorical to Numerical¶
In [21]:
# Set up subplots
fig, axes = plt.subplots(nrows=len(num_columns), ncols=len(cat_columns), figsize=(15, 100))
# Flatten the axes array
axes = axes.flatten()
# Iterate through numerical and categorical columns for boxplots
for i, num_col in enumerate(num_columns):
for j, cat_col in enumerate(cat_columns):
# Create boxplot for the current combination
sns.boxplot(x=cat_col, y=num_col, data=df, ax=axes[i * len(cat_columns) + j])
axes[i * len(cat_columns) + j].set_title(f'{num_col} vs {cat_col}')
# Adjust layout
plt.tight_layout()
plt.show()
In [ ]:
# Iterate through numerical and categorical columns for statistical values
for num_col in num_columns:
for cat_col in cat_columns:
# Extract statistical values for the current combination
stats = df.groupby(cat_col)[num_col].describe()
# Print the statistics
print(f"\nStatistics for {num_col} vs {cat_col}:\n{stats}")
In [22]:
# Convert categorical columns to numerical using one-hot encoding
df_encoded = pd.get_dummies(df, columns=cat_columns)
# Exclude numerical columns from encoded categorical columns
cat_columns_encoded = [col for col in df_encoded.columns if col not in num_columns]
num_columns = [col for col in df_encoded.columns if col not in cat_columns_encoded]
# Combine numerical and encoded categorical columns
all_columns = num_columns + cat_columns_encoded
# Compute the correlation matrix
corr_matrix = df_encoded[all_columns].corr()
# Set up the matplotlib figure
plt.figure(figsize=(30, 30))
# Create a heatmap for the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# Set the title
plt.title('Correlation Heatmap for Numerical and Categorical Variables')
# Show the plot
plt.show()
In [23]:
# Flatten the upper triangle of the correlation matrix
corr_values = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)).stack().sort_values(ascending=False)
# Display correlations in descending order
print("Correlations in Descending Order:")
for index, value in corr_values.items():
print(f"{index[0]} vs {index[1]}: {value:.2f}")
Correlations in Descending Order: fork vs vflt: 0.94 pflt vs vflt: 0.94 fork vs pflt: 0.93 pgin vs ppgin: 0.92 ppgout vs pgfree: 0.92 pgfree vs pgscan: 0.92 sread vs swrite: 0.88 pgout vs ppgout: 0.87 ppgout vs pgscan: 0.79 fork vs exec: 0.76 pgout vs pgfree: 0.73 scall vs sread: 0.70 exec vs vflt: 0.69 freeswap vs usr: 0.68 exec vs pflt: 0.65 scall vs swrite: 0.62 pgfree vs ppgin: 0.59 freemem vs freeswap: 0.57 pgscan vs ppgin: 0.56 pgout vs pgscan: 0.55 ppgout vs ppgin: 0.54 lread vs lwrite: 0.53 pgfree vs pgin: 0.53 scall vs vflt: 0.53 rchar vs wchar: 0.50 sread vs rchar: 0.50 pgscan vs pgin: 0.50 sread vs vflt: 0.49 ppgout vs pgin: 0.49 scall vs pflt: 0.48 sread vs pflt: 0.45 scall vs fork: 0.45 sread vs fork: 0.42 swrite vs vflt: 0.42 pgout vs ppgin: 0.41 sread vs wchar: 0.40 swrite vs pflt: 0.40 swrite vs wchar: 0.39 pgout vs pgin: 0.39 swrite vs fork: 0.38 rchar vs vflt: 0.36 scall vs rchar: 0.35 rchar vs ppgin: 0.35 swrite vs rchar: 0.33 rchar vs pflt: 0.31 scall vs exec: 0.31 pgin vs vflt: 0.30 pgfree vs vflt: 0.30 rchar vs pgin: 0.30 ppgout vs vflt: 0.29 pgscan vs vflt: 0.28 fork vs rchar: 0.28 rchar vs pgfree: 0.28 scall vs wchar: 0.27 freemem vs usr: 0.27 rchar vs ppgout: 0.27 ppgin vs vflt: 0.26 usr vs runqsz_Not_CPU_Bound: 0.26 rchar vs pgscan: 0.26 scall vs runqsz_CPU_Bound: 0.24 scall vs pgin: 0.24 pgout vs vflt: 0.23 sread vs ppgout: 0.23 scall vs ppgin: 0.22 sread vs pgfree: 0.21 rchar vs pgout: 0.21 sread vs ppgin: 0.21 scall vs ppgout: 0.21 sread vs pgin: 0.21 wchar vs ppgin: 0.20 scall vs pgfree: 0.20 scall vs pgout: 0.19 rchar vs runqsz_CPU_Bound: 0.19 sread vs pgscan: 0.19 sread vs pgout: 0.19 wchar vs pgout: 0.19 lread vs scall: 0.19 pgfree vs pflt: 0.19 lread vs pgin: 0.19 wchar vs ppgout: 0.19 exec vs pgin: 0.19 ppgout vs pflt: 0.19 wchar vs atch: 0.18 pgscan vs pflt: 0.18 wchar vs pgin: 0.18 scall vs pgscan: 0.18 pgin vs pflt: 0.18 rchar vs atch: 0.17 sread vs runqsz_CPU_Bound: 0.17 exec vs rchar: 0.17 fork vs pgfree: 0.17 fork vs ppgout: 0.17 lread vs vflt: 0.17 freemem vs runqsz_Not_CPU_Bound: 0.16 wchar vs runqsz_CPU_Bound: 0.16 sread vs exec: 0.16 fork vs pgin: 0.16 lread vs ppgin: 0.16 fork vs pgscan: 0.16 swrite vs ppgout: 0.16 wchar vs pgfree: 0.16 swrite vs pgout: 0.15 pgout vs pflt: 0.15 ppgin vs pflt: 0.15 exec vs ppgin: 0.15 exec vs ppgout: 0.15 pgout vs atch: 0.15 swrite vs pgin: 0.15 exec vs pgfree: 0.15 swrite vs pgfree: 0.15 exec vs pgscan: 0.14 swrite vs ppgin: 0.14 lwrite vs scall: 0.14 lread vs fork: 0.14 lread vs pflt: 0.14 lread vs sread: 0.13 swrite vs runqsz_CPU_Bound: 0.13 fork vs ppgin: 0.13 lread vs ppgout: 0.13 fork vs pgout: 0.13 lwrite vs sread: 0.13 vflt vs runqsz_CPU_Bound: 0.12 swrite vs pgscan: 0.12 lread vs swrite: 0.12 pflt vs runqsz_CPU_Bound: 0.12 lwrite vs rchar: 0.12 lread vs pgfree: 0.11 wchar vs pgscan: 0.11 exec vs pgout: 0.11 wchar vs vflt: 0.11 lread vs exec: 0.11 lread vs rchar: 0.11 swrite vs exec: 0.10 lwrite vs swrite: 0.10 atch vs vflt: 0.10 lwrite vs vflt: 0.09 ppgout vs atch: 0.09 fork vs runqsz_CPU_Bound: 0.09 lwrite vs wchar: 0.09 lwrite vs pgin: 0.09 lwrite vs ppgin: 0.09 lread vs pgscan: 0.09 wchar vs pflt: 0.09 sread vs atch: 0.09 lread vs pgout: 0.08 lread vs wchar: 0.08 lwrite vs ppgout: 0.08 scall vs atch: 0.08 pgin vs runqsz_CPU_Bound: 0.07 ppgin vs runqsz_CPU_Bound: 0.07 pgfree vs atch: 0.07 lread vs runqsz_CPU_Bound: 0.07 lwrite vs pflt: 0.07 lwrite vs pgout: 0.07 freeswap vs runqsz_Not_CPU_Bound: 0.07 lwrite vs pgfree: 0.07 swrite vs atch: 0.06 fork vs wchar: 0.06 atch vs pgin: 0.06 atch vs ppgin: 0.06 lwrite vs fork: 0.05 atch vs runqsz_CPU_Bound: 0.05 exec vs atch: 0.05 atch vs pflt: 0.05 lwrite vs runqsz_CPU_Bound: 0.05 exec vs runqsz_CPU_Bound: 0.05 fork vs atch: 0.05 pgfree vs runqsz_CPU_Bound: 0.04 lwrite vs pgscan: 0.04 pgscan vs atch: 0.04 lwrite vs exec: 0.04 pgscan vs runqsz_CPU_Bound: 0.04 ppgout vs runqsz_CPU_Bound: 0.04 lwrite vs atch: 0.03 pgout vs runqsz_CPU_Bound: 0.02 lread vs atch: 0.02 exec vs wchar: 0.00 pgout vs runqsz_Not_CPU_Bound: -0.02 ppgout vs runqsz_Not_CPU_Bound: -0.04 pgscan vs runqsz_Not_CPU_Bound: -0.04 pgfree vs runqsz_Not_CPU_Bound: -0.04 exec vs runqsz_Not_CPU_Bound: -0.05 lwrite vs runqsz_Not_CPU_Bound: -0.05 atch vs runqsz_Not_CPU_Bound: -0.05 freeswap vs runqsz_CPU_Bound: -0.07 lread vs runqsz_Not_CPU_Bound: -0.07 ppgin vs runqsz_Not_CPU_Bound: -0.07 pgin vs runqsz_Not_CPU_Bound: -0.07 lread vs freeswap: -0.08 lread vs freemem: -0.08 atch vs freemem: -0.09 lwrite vs freemem: -0.09 fork vs runqsz_Not_CPU_Bound: -0.09 lwrite vs usr: -0.11 pflt vs freemem: -0.11 pflt vs runqsz_Not_CPU_Bound: -0.12 lwrite vs freeswap: -0.12 vflt vs runqsz_Not_CPU_Bound: -0.12 atch vs freeswap: -0.12 fork vs freemem: -0.12 atch vs usr: -0.13 fork vs freeswap: -0.13 pflt vs freeswap: -0.13 swrite vs runqsz_Not_CPU_Bound: -0.13 lread vs usr: -0.14 wchar vs freemem: -0.15 rchar vs freemem: -0.15 exec vs freeswap: -0.15 exec vs freemem: -0.16 wchar vs runqsz_Not_CPU_Bound: -0.16 freemem vs runqsz_CPU_Bound: -0.16 sread vs runqsz_Not_CPU_Bound: -0.17 pgscan vs freeswap: -0.18 pgscan vs usr: -0.18 pgscan vs freemem: -0.19 rchar vs runqsz_Not_CPU_Bound: -0.19 vflt vs freemem: -0.20 pgfree vs freeswap: -0.21 ppgout vs usr: -0.21 ppgout vs freeswap: -0.21 ppgin vs freemem: -0.22 pgfree vs usr: -0.22 rchar vs freeswap: -0.22 pgout vs usr: -0.22 wchar vs freeswap: -0.23 pgin vs freemem: -0.23 ppgin vs usr: -0.23 pgfree vs freemem: -0.23 swrite vs freeswap: -0.24 pgin vs usr: -0.24 scall vs runqsz_Not_CPU_Bound: -0.24 pgout vs freeswap: -0.25 vflt vs freeswap: -0.25 ppgout vs freemem: -0.25 swrite vs freemem: -0.25 ppgin vs freeswap: -0.25 usr vs runqsz_CPU_Bound: -0.26 pgout vs freemem: -0.27 swrite vs usr: -0.27 pgin vs freeswap: -0.28 sread vs freemem: -0.29 exec vs usr: -0.29 wchar vs usr: -0.29 sread vs freeswap: -0.30 scall vs usr: -0.32 rchar vs usr: -0.33 sread vs usr: -0.33 scall vs freeswap: -0.35 fork vs usr: -0.36 pflt vs usr: -0.37 scall vs freemem: -0.39 vflt vs usr: -0.42 runqsz_CPU_Bound vs runqsz_Not_CPU_Bound: -1.00
Problem 1 - Data Pre-processing¶
In [24]:
df.isnull().sum()
Out[24]:
lread 0 lwrite 0 scall 0 sread 0 swrite 0 fork 0 exec 0 rchar 104 wchar 15 pgout 0 ppgout 0 pgfree 0 pgscan 0 atch 0 pgin 0 ppgin 0 pflt 0 vflt 0 runqsz 0 freemem 0 freeswap 0 usr 0 dtype: int64
In [25]:
#impute null values
df['rchar']=df['rchar'].fillna(df['rchar'].median())
df['wchar']=df['wchar'].fillna(df['wchar'].median())
In [26]:
df.isnull().sum()
Out[26]:
lread 0 lwrite 0 scall 0 sread 0 swrite 0 fork 0 exec 0 rchar 0 wchar 0 pgout 0 ppgout 0 pgfree 0 pgscan 0 atch 0 pgin 0 ppgin 0 pflt 0 vflt 0 runqsz 0 freemem 0 freeswap 0 usr 0 dtype: int64
In [27]:
def remove_outlier(col):
col = sorted(col)
Q1, Q3 = np.percentile(col, [25, 75])
IQR = Q3 - Q1
lower_range = Q1 - (1.5 * IQR)
upper_range = Q3 + (1.5 * IQR)
return lower_range, upper_range
# List to store the lower and upper ranges for each column
ranges = []
# Iterate through numerical columns and remove outliers
for num_col in num_columns:
lr, ur = remove_outlier(df[num_col])
print(f"For {num_col}, lower range is {lr} and upper range is {ur}")
# Store the ranges for reference
ranges.append((num_col, lr, ur))
# Remove outliers in the dataframe
df[num_col] = np.where(df[num_col] > ur, ur, df[num_col])
df[num_col] = np.where(df[num_col] < lr, lr, df[num_col])
# Print the dataframe after removing outliers
print(df.head())
For lread, lower range is -25.0 and upper range is 47.0
For lwrite, lower range is -15.0 and upper range is 25.0
For scall, lower range is -2445.875 and upper range is 6775.125
For sread, lower range is -203.5 and upper range is 568.5
For swrite, lower range is -120.0 and upper range is 368.0
For fork, lower range is -2.3000000000000003 and upper range is 4.9
For exec, lower range is -3.6999999999999993 and upper range is 6.699999999999999
For rchar, lower range is -310940.875 and upper range is 611196.125
For wchar, lower range is -101611.125 and upper range is 230625.875
For pgout, lower range is -3.5999999999999996 and upper range is 6.0
For ppgout, lower range is -6.300000000000001 and upper range is 10.5
For pgfree, lower range is -7.5 and upper range is 12.5
For pgscan, lower range is 0.0 and upper range is 0.0
For atch, lower range is -0.8999999999999999 and upper range is 1.5
For pgin, lower range is -13.147500000000003 and upper range is 23.512500000000003
For ppgin, lower range is -19.2 and upper range is 33.6
For pflt, lower range is -176.89999999999998 and upper range is 361.5
For vflt, lower range is -264.20000000000005 and upper range is 561.4000000000001
For freemem, lower range is -2425.875 and upper range is 4659.125
For freeswap, lower range is 10989.5 and upper range is 2762013.5
For usr, lower range is 61.5 and upper range is 113.5
lread lwrite scall sread swrite fork exec rchar wchar pgout \
0 1.0 0.0 2147.0 79.0 68.0 0.2 0.2 40671.0 53995.0 0.0
1 0.0 0.0 170.0 18.0 21.0 0.2 0.2 448.0 8385.0 0.0
2 15.0 3.0 2162.0 159.0 119.0 2.0 2.4 125473.5 31950.0 0.0
3 0.0 0.0 160.0 12.0 16.0 0.2 0.2 125473.5 8670.0 0.0
4 5.0 1.0 330.0 39.0 38.0 0.4 0.4 125473.5 12185.0 0.0
... pgscan atch pgin ppgin pflt vflt runqsz freemem \
0 ... 0.0 0.0 1.6 2.6 16.00 26.40 CPU_Bound 4659.125
1 ... 0.0 0.0 0.0 0.0 15.63 16.83 Not_CPU_Bound 4659.125
2 ... 0.0 1.2 6.0 9.4 150.20 220.20 Not_CPU_Bound 702.000
3 ... 0.0 0.0 0.2 0.2 15.60 16.80 Not_CPU_Bound 4659.125
4 ... 0.0 0.0 1.0 1.2 37.80 47.60 Not_CPU_Bound 633.000
freeswap usr
0 1730946.0 95.0
1 1869002.0 97.0
2 1021237.0 87.0
3 1863704.0 98.0
4 1760253.0 90.0
[5 rows x 22 columns]
In [28]:
# Set up subplot for boxplot
fig, axes = plt.subplots(figsize=(20,16))
# Draw boxplot for all numerical columns
sns.boxplot(data=df[num_columns], ax=axes)
axes.set_title('Boxplot for Numerical Columns')
# Show the boxplot
plt.show()
In [29]:
#encoding data
df = pd.get_dummies(df, columns=['runqsz'], drop_first=True)
df
Out[29]:
| lread | lwrite | scall | sread | swrite | fork | exec | rchar | wchar | pgout | ... | pgscan | atch | pgin | ppgin | pflt | vflt | freemem | freeswap | usr | runqsz_Not_CPU_Bound | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 2147.0 | 79.0 | 68.0 | 0.2 | 0.20 | 40671.0 | 53995.0 | 0.0 | ... | 0.0 | 0.0 | 1.6000 | 2.60 | 16.00 | 26.40 | 4659.125 | 1730946.0 | 95.0 | False |
| 1 | 0.0 | 0.0 | 170.0 | 18.0 | 21.0 | 0.2 | 0.20 | 448.0 | 8385.0 | 0.0 | ... | 0.0 | 0.0 | 0.0000 | 0.00 | 15.63 | 16.83 | 4659.125 | 1869002.0 | 97.0 | True |
| 2 | 15.0 | 3.0 | 2162.0 | 159.0 | 119.0 | 2.0 | 2.40 | 125473.5 | 31950.0 | 0.0 | ... | 0.0 | 1.2 | 6.0000 | 9.40 | 150.20 | 220.20 | 702.000 | 1021237.0 | 87.0 | True |
| 3 | 0.0 | 0.0 | 160.0 | 12.0 | 16.0 | 0.2 | 0.20 | 125473.5 | 8670.0 | 0.0 | ... | 0.0 | 0.0 | 0.2000 | 0.20 | 15.60 | 16.80 | 4659.125 | 1863704.0 | 98.0 | True |
| 4 | 5.0 | 1.0 | 330.0 | 39.0 | 38.0 | 0.4 | 0.40 | 125473.5 | 12185.0 | 0.0 | ... | 0.0 | 0.0 | 1.0000 | 1.20 | 37.80 | 47.60 | 633.000 | 1760253.0 | 90.0 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8187 | 16.0 | 12.0 | 3009.0 | 360.0 | 244.0 | 1.6 | 5.81 | 405250.0 | 85282.0 | 6.0 | ... | 0.0 | 0.6 | 23.5125 | 33.60 | 139.28 | 270.74 | 387.000 | 986647.0 | 80.0 | False |
| 8188 | 4.0 | 0.0 | 1596.0 | 170.0 | 146.0 | 2.4 | 1.80 | 89489.0 | 41764.0 | 3.8 | ... | 0.0 | 0.8 | 3.8000 | 4.40 | 122.40 | 212.60 | 263.000 | 1055742.0 | 90.0 | True |
| 8189 | 16.0 | 5.0 | 3116.0 | 289.0 | 190.0 | 0.6 | 0.60 | 325948.0 | 52640.0 | 0.4 | ... | 0.0 | 0.4 | 23.5125 | 33.60 | 60.20 | 219.80 | 400.000 | 969106.0 | 87.0 | True |
| 8190 | 32.0 | 25.0 | 5180.0 | 254.0 | 179.0 | 1.2 | 1.20 | 62571.0 | 29505.0 | 1.4 | ... | 0.0 | 0.4 | 23.0500 | 24.25 | 93.19 | 202.81 | 141.000 | 1022458.0 | 83.0 | False |
| 8191 | 2.0 | 0.0 | 985.0 | 55.0 | 46.0 | 1.6 | 4.80 | 111111.0 | 22256.0 | 0.0 | ... | 0.0 | 0.2 | 3.4000 | 6.20 | 91.80 | 110.00 | 659.000 | 1756514.0 | 94.0 | False |
8192 rows × 22 columns
In [30]:
df['runqsz_Not_CPU_Bound'] = df['runqsz_Not_CPU_Bound'].astype('uint8')
In [31]:
#Getting train and test variables
# independent variables
X = df.drop(["usr"], axis=1)
# dependent variable
y = df[["usr"]]
In [32]:
# let's add the intercept to data
import statsmodels.api as sm
X = sm.add_constant(X)
In [33]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8192 entries, 0 to 8191 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 lread 8192 non-null float64 1 lwrite 8192 non-null float64 2 scall 8192 non-null float64 3 sread 8192 non-null float64 4 swrite 8192 non-null float64 5 fork 8192 non-null float64 6 exec 8192 non-null float64 7 rchar 8192 non-null float64 8 wchar 8192 non-null float64 9 pgout 8192 non-null float64 10 ppgout 8192 non-null float64 11 pgfree 8192 non-null float64 12 pgscan 8192 non-null float64 13 atch 8192 non-null float64 14 pgin 8192 non-null float64 15 ppgin 8192 non-null float64 16 pflt 8192 non-null float64 17 vflt 8192 non-null float64 18 freemem 8192 non-null float64 19 freeswap 8192 non-null float64 20 usr 8192 non-null float64 21 runqsz_Not_CPU_Bound 8192 non-null uint8 dtypes: float64(21), uint8(1) memory usage: 1.3 MB
In [34]:
#We will now split X and y into train and test sets in a 70:30 ratio.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.30, random_state=1
)
In [35]:
#Fit Linear Model
olsmod = sm.OLS(y_train, X_train)
olsres = olsmod.fit()
In [36]:
# let's print the regression summary
print(olsres.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.796
Model: OLS Adj. R-squared: 0.795
Method: Least Squares F-statistic: 1115.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:37:00 Log-Likelihood: -16657.
No. Observations: 5734 AIC: 3.336e+04
Df Residuals: 5713 BIC: 3.350e+04
Df Model: 20
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 84.1217 0.316 266.106 0.000 83.502 84.741
lread -0.0635 0.009 -7.071 0.000 -0.081 -0.046
lwrite 0.0482 0.013 3.671 0.000 0.022 0.074
scall -0.0007 6.28e-05 -10.566 0.000 -0.001 -0.001
sread 0.0003 0.001 0.305 0.760 -0.002 0.002
swrite -0.0054 0.001 -3.777 0.000 -0.008 -0.003
fork 0.0293 0.132 0.222 0.824 -0.229 0.288
exec -0.3212 0.052 -6.220 0.000 -0.422 -0.220
rchar -5.167e-06 4.88e-07 -10.598 0.000 -6.12e-06 -4.21e-06
wchar -5.403e-06 1.03e-06 -5.232 0.000 -7.43e-06 -3.38e-06
pgout -0.3688 0.090 -4.098 0.000 -0.545 -0.192
ppgout -0.0766 0.079 -0.973 0.330 -0.231 0.078
pgfree 0.0845 0.048 1.769 0.077 -0.009 0.178
pgscan 5.192e-14 2.39e-16 216.826 0.000 5.15e-14 5.24e-14
atch 0.6276 0.143 4.394 0.000 0.348 0.908
pgin 0.0200 0.028 0.703 0.482 -0.036 0.076
ppgin -0.0673 0.020 -3.415 0.001 -0.106 -0.029
pflt -0.0336 0.002 -16.957 0.000 -0.037 -0.030
vflt -0.0055 0.001 -3.830 0.000 -0.008 -0.003
freemem -0.0005 5.07e-05 -9.038 0.000 -0.001 -0.000
freeswap 8.832e-06 1.9e-07 46.472 0.000 8.46e-06 9.2e-06
runqsz_Not_CPU_Bound 1.6153 0.126 12.819 0.000 1.368 1.862
==============================================================================
Omnibus: 1103.645 Durbin-Watson: 2.016
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2372.553
Skew: -1.119 Prob(JB): 0.00
Kurtosis: 5.219 Cond. No. 2.92e+22
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.34e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [37]:
# let's check the VIF of the predictors
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_series1 = pd.Series(
[variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series1))
VIF values: const 29.229332 lread 5.350560 lwrite 4.328397 scall 2.960609 sread 6.420172 swrite 5.597135 fork 13.035359 exec 3.241417 rchar 2.133616 wchar 1.584381 pgout 11.360363 ppgout 29.404223 pgfree 16.496748 pgscan NaN atch 1.875901 pgin 13.809339 ppgin 13.951855 pflt 12.001460 vflt 15.971049 freemem 1.961304 freeswap 1.841239 runqsz_Not_CPU_Bound 1.156815 dtype: float64
In [38]:
variables_to_drop = ["atch", "pgin", "ppgin", "pflt", "vflt", "pgfree", "ppgout", "pgout", "fork", "exec", "swrite", "sread", "lwrite", "lread", "pgscan", "runqsz_Not_CPU_Bound"]
for variable in variables_to_drop:
X_train_temp = X_train.drop([variable], axis=1)
ols_model_temp = sm.OLS(y_train, X_train_temp)
ols_res_temp = ols_model_temp.fit()
r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.0 R-squared: 0.795 Adjusted R-squared: 0.795 On dropping 'pgin', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'ppgin', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.01 R-squared: 0.786 Adjusted R-squared: 0.785 On dropping 'vflt', Adjusted R-squared minus Adjusted R-squared is 0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'pgfree', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'ppgout', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'pgout', Adjusted R-squared minus Adjusted R-squared is 0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'fork', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.001 R-squared: 0.795 Adjusted R-squared: 0.794 On dropping 'swrite', Adjusted R-squared minus Adjusted R-squared is 0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'sread', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.001 R-squared: 0.794 Adjusted R-squared: 0.794 On dropping 'pgscan', Adjusted R-squared minus Adjusted R-squared is -0.0 R-squared: 0.796 Adjusted R-squared: 0.795 On dropping 'runqsz_Not_CPU_Bound', Adjusted R-squared minus Adjusted R-squared is 0.005 R-squared: 0.79 Adjusted R-squared: 0.79
Since there is a very small effect (0.001) on adj. R-squared after dropping the list in variable "variables_to_drop", we can remove it from the training set.
In [39]:
X_train = X_train.drop(["pgout", "ppgin", "ppgout", "pgscan", "fork"], axis=1)
In [40]:
olsmod_2 = sm.OLS(y_train, X_train)
olsres_2 = olsmod_2.fit()
print(olsres_2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.794
Model: OLS Adj. R-squared: 0.794
Method: Least Squares F-statistic: 1380.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:37:00 Log-Likelihood: -16683.
No. Observations: 5734 AIC: 3.340e+04
Df Residuals: 5717 BIC: 3.351e+04
Df Model: 16
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 84.1524 0.314 268.183 0.000 83.537 84.768
lread -0.0647 0.009 -7.201 0.000 -0.082 -0.047
lwrite 0.0489 0.013 3.718 0.000 0.023 0.075
scall -0.0007 6.26e-05 -10.742 0.000 -0.001 -0.001
sread 0.0002 0.001 0.225 0.822 -0.002 0.002
swrite -0.0052 0.001 -3.668 0.000 -0.008 -0.002
exec -0.3086 0.050 -6.213 0.000 -0.406 -0.211
rchar -5.297e-06 4.85e-07 -10.911 0.000 -6.25e-06 -4.34e-06
wchar -5.799e-06 1.03e-06 -5.607 0.000 -7.83e-06 -3.77e-06
pgfree -0.1109 0.016 -6.761 0.000 -0.143 -0.079
atch 0.4034 0.138 2.921 0.003 0.133 0.674
pgin -0.0738 0.010 -7.483 0.000 -0.093 -0.054
pflt -0.0337 0.002 -18.277 0.000 -0.037 -0.030
vflt -0.0051 0.001 -4.023 0.000 -0.008 -0.003
freemem -0.0004 5.08e-05 -8.749 0.000 -0.001 -0.000
freeswap 8.82e-06 1.89e-07 46.581 0.000 8.45e-06 9.19e-06
runqsz_Not_CPU_Bound 1.5596 0.126 12.354 0.000 1.312 1.807
==============================================================================
Omnibus: 1086.134 Durbin-Watson: 2.015
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2304.413
Skew: -1.108 Prob(JB): 0.00
Kurtosis: 5.176 Cond. No. 7.64e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.64e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [41]:
vif_series2 = pd.Series(
[variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series2))
VIF values: const 28.561995 lread 5.312644 lwrite 4.305704 scall 2.913380 sread 6.402072 swrite 5.422510 exec 2.975076 rchar 2.098192 wchar 1.576027 pgfree 1.930990 atch 1.738475 pgin 1.648652 pflt 10.324080 vflt 12.356350 freemem 1.953053 freeswap 1.812799 runqsz_Not_CPU_Bound 1.151692 dtype: float64
In [42]:
variables_to_drop2 = ["swrite", "sread", "lread", "lwrite", "pflt", "vflt", "atch", "exec"]
for variable in variables_to_drop2:
X_train_temp = X_train.drop([variable], axis=1)
ols_model_temp = sm.OLS(y_train, X_train_temp)
ols_res_temp = ols_model_temp.fit()
r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
On dropping 'swrite', Adjusted R-squared minus Adjusted R-squared is 0.002 R-squared: 0.794 Adjusted R-squared: 0.793 On dropping 'sread', Adjusted R-squared minus Adjusted R-squared is 0.001 R-squared: 0.794 Adjusted R-squared: 0.794 On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.003 R-squared: 0.792 Adjusted R-squared: 0.792 On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.002 R-squared: 0.794 Adjusted R-squared: 0.793 On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.013 R-squared: 0.782 Adjusted R-squared: 0.782 On dropping 'vflt', Adjusted R-squared minus Adjusted R-squared is 0.002 R-squared: 0.794 Adjusted R-squared: 0.793 On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.002 R-squared: 0.794 Adjusted R-squared: 0.793 On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.003 R-squared: 0.793 Adjusted R-squared: 0.792
In [43]:
X_train = X_train.drop(["vflt", "swrite", "sread"], axis=1)
In [44]:
olsmod_3 = sm.OLS(y_train, X_train)
olsres_3 = olsmod_3.fit()
print(olsres_3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.793
Model: OLS Adj. R-squared: 0.792
Method: Least Squares F-statistic: 1682.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:37:01 Log-Likelihood: -16705.
No. Observations: 5734 AIC: 3.344e+04
Df Residuals: 5720 BIC: 3.353e+04
Df Model: 13
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 83.8075 0.310 270.281 0.000 83.200 84.415
lread -0.0681 0.009 -7.581 0.000 -0.086 -0.050
lwrite 0.0519 0.013 3.943 0.000 0.026 0.078
scall -0.0009 4.86e-05 -17.694 0.000 -0.001 -0.001
exec -0.2682 0.046 -5.779 0.000 -0.359 -0.177
rchar -5.509e-06 4.35e-07 -12.670 0.000 -6.36e-06 -4.66e-06
wchar -6.75e-06 9.83e-07 -6.868 0.000 -8.68e-06 -4.82e-06
pgfree -0.1170 0.016 -7.134 0.000 -0.149 -0.085
atch 0.3907 0.138 2.825 0.005 0.120 0.662
pgin -0.0853 0.010 -8.961 0.000 -0.104 -0.067
pflt -0.0420 0.001 -42.696 0.000 -0.044 -0.040
freemem -0.0004 5.07e-05 -8.351 0.000 -0.001 -0.000
freeswap 8.984e-06 1.87e-07 47.951 0.000 8.62e-06 9.35e-06
runqsz_Not_CPU_Bound 1.5512 0.127 12.243 0.000 1.303 1.800
==============================================================================
Omnibus: 991.432 Durbin-Watson: 2.012
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2024.692
Skew: -1.034 Prob(JB): 0.00
Kurtosis: 5.048 Cond. No. 7.53e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.53e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [45]:
vif_series3 = pd.Series(
[variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series3))
VIF values: const 27.685755 lread 5.273659 lwrite 4.286869 scall 1.742585 exec 2.577562 rchar 1.670597 wchar 1.412736 pgfree 1.917857 atch 1.731517 pgin 1.524691 pflt 2.914826 freemem 1.930797 freeswap 1.761721 runqsz_Not_CPU_Bound 1.151418 dtype: float64
In [ ]:
In [46]:
variables_to_drop3 = ["lwrite", "exec", "pflt", "lread", "atch"]
for variable in variables_to_drop3:
X_train_temp = X_train.drop([variable], axis=1)
ols_model_temp = sm.OLS(y_train, X_train_temp)
ols_res_temp = ols_model_temp.fit()
r_squared_diff = 0.795 - ols_res_temp.rsquared_adj
print(f"On dropping '{variable}', Adjusted R-squared minus Adjusted R-squared is {np.round(r_squared_diff, 3)}")
print(f"R-squared: {np.round(ols_res_temp.rsquared, 3)}\nAdjusted R-squared: {np.round(ols_res_temp.rsquared_adj, 3)}\n")
On dropping 'lwrite', Adjusted R-squared minus Adjusted R-squared is 0.003 R-squared: 0.792 Adjusted R-squared: 0.792 On dropping 'exec', Adjusted R-squared minus Adjusted R-squared is 0.004 R-squared: 0.791 Adjusted R-squared: 0.791 On dropping 'pflt', Adjusted R-squared minus Adjusted R-squared is 0.069 R-squared: 0.727 Adjusted R-squared: 0.726 On dropping 'lread', Adjusted R-squared minus Adjusted R-squared is 0.005 R-squared: 0.791 Adjusted R-squared: 0.79 On dropping 'atch', Adjusted R-squared minus Adjusted R-squared is 0.003 R-squared: 0.792 Adjusted R-squared: 0.792
In [47]:
X_train = X_train.drop(["exec", "lwrite"], axis=1)
In [ ]:
In [48]:
olsmod_4 = sm.OLS(y_train, X_train)
olsres_4 = olsmod_4.fit()
print(olsres_4.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.791
Model: OLS Adj. R-squared: 0.790
Method: Least Squares F-statistic: 1966.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:37:01 Log-Likelihood: -16731.
No. Observations: 5734 AIC: 3.349e+04
Df Residuals: 5722 BIC: 3.357e+04
Df Model: 11
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 83.8133 0.311 269.545 0.000 83.204 84.423
lread -0.0396 0.004 -8.900 0.000 -0.048 -0.031
scall -0.0009 4.87e-05 -18.200 0.000 -0.001 -0.001
rchar -5.528e-06 4.37e-07 -12.660 0.000 -6.38e-06 -4.67e-06
wchar -6.343e-06 9.84e-07 -6.445 0.000 -8.27e-06 -4.41e-06
pgfree -0.1202 0.016 -7.298 0.000 -0.152 -0.088
atch 0.3536 0.139 2.550 0.011 0.082 0.625
pgin -0.0957 0.009 -10.144 0.000 -0.114 -0.077
pflt -0.0467 0.001 -64.433 0.000 -0.048 -0.045
freemem -0.0004 5.09e-05 -8.116 0.000 -0.001 -0.000
freeswap 8.997e-06 1.88e-07 47.815 0.000 8.63e-06 9.37e-06
runqsz_Not_CPU_Bound 1.6020 0.127 12.627 0.000 1.353 1.851
==============================================================================
Omnibus: 943.471 Durbin-Watson: 2.014
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1880.411
Skew: -0.999 Prob(JB): 0.00
Kurtosis: 4.969 Cond. No. 7.52e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.52e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [49]:
vif_series4 = pd.Series(
[variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series4))
VIF values: const 27.599007 lread 1.284916 scall 1.732868 rchar 1.670331 wchar 1.404285 pgfree 1.915050 atch 1.725226 pgin 1.484113 pflt 1.567345 freemem 1.929209 freeswap 1.761537 runqsz_Not_CPU_Bound 1.144543 dtype: float64
In [ ]:
Assumptions of Linear Regression¶
In [50]:
df_pred = pd.DataFrame()
df_pred["Actual Values"] = y_train.values.flatten() # actual values
df_pred["Fitted Values"] = olsres_4.fittedvalues.values # predicted values
df_pred["Residuals"] = olsres_4.resid.values # residuals
df_pred.head()
Out[50]:
| Actual Values | Fitted Values | Residuals | |
|---|---|---|---|
| 0 | 91.0 | 89.693813 | 1.306187 |
| 1 | 94.0 | 91.604172 | 2.395828 |
| 2 | 61.5 | 74.803965 | -13.303965 |
| 3 | 83.0 | 80.799348 | 2.200652 |
| 4 | 94.0 | 98.105864 | -4.105864 |
In [51]:
# let us plot the fitted values vs residuals
sns.set_style("whitegrid")
sns.residplot(
data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()
No pattern in the data thus the assumption of linearity and independence of predictors satisfied¶
In [52]:
# columns in training set
X_train.columns
Out[52]:
Index(['const', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound'],
dtype='object')
In [53]:
# checking the distribution of variables in training set with dependent variable
sns_plot = sns.pairplot(df[['usr', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound']])
sns_plot.figure.savefig("pairplot.png")
plt.show()
In [54]:
# using square transformation
X_train["scall_sq"] = np.square(X_train["scall"])
# let's create a model with the transformed data
olsmod_5 = sm.OLS(y_train, X_train)
olsres_5 = olsmod_5.fit()
print(olsres_5.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.799
Model: OLS Adj. R-squared: 0.799
Method: Least Squares F-statistic: 1895.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:38:00 Log-Likelihood: -16616.
No. Observations: 5734 AIC: 3.326e+04
Df Residuals: 5721 BIC: 3.335e+04
Df Model: 12
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 80.9943 0.356 227.462 0.000 80.296 81.692
lread -0.0404 0.004 -9.253 0.000 -0.049 -0.032
scall 0.0011 0.000 7.979 0.000 0.001 0.001
rchar -5.472e-06 4.28e-07 -12.784 0.000 -6.31e-06 -4.63e-06
wchar -6.973e-06 9.66e-07 -7.222 0.000 -8.87e-06 -5.08e-06
pgfree -0.1095 0.016 -6.775 0.000 -0.141 -0.078
atch 0.3592 0.136 2.643 0.008 0.093 0.626
pgin -0.1083 0.009 -11.664 0.000 -0.127 -0.090
pflt -0.0476 0.001 -66.780 0.000 -0.049 -0.046
freemem -0.0003 5.08e-05 -5.359 0.000 -0.000 -0.000
freeswap 9.468e-06 1.87e-07 50.635 0.000 9.1e-06 9.83e-06
runqsz_Not_CPU_Bound 1.8196 0.125 14.537 0.000 1.574 2.065
scall_sq -3.192e-07 2.08e-08 -15.312 0.000 -3.6e-07 -2.78e-07
==============================================================================
Omnibus: 867.663 Durbin-Watson: 2.012
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1685.469
Skew: -0.938 Prob(JB): 0.00
Kurtosis: 4.881 Cond. No. 7.71e+07
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.71e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
In [55]:
# using square transformation
X_train["pflt_sq"] = np.square(X_train["pflt"])
# let's create a model with the transformed data
olsmod_6 = sm.OLS(y_train, X_train)
olsres_6 = olsmod_6.fit()
print(olsres_6.summary())
OLS Regression Results
==============================================================================
Dep. Variable: usr R-squared: 0.803
Model: OLS Adj. R-squared: 0.802
Method: Least Squares F-statistic: 1788.
Date: Thu, 11 Jan 2024 Prob (F-statistic): 0.00
Time: 07:38:00 Log-Likelihood: -16565.
No. Observations: 5734 AIC: 3.316e+04
Df Residuals: 5720 BIC: 3.325e+04
Df Model: 13
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 80.3258 0.359 223.722 0.000 79.622 81.030
lread -0.0406 0.004 -9.385 0.000 -0.049 -0.032
scall 0.0009 0.000 6.575 0.000 0.001 0.001
rchar -5.569e-06 4.24e-07 -13.124 0.000 -6.4e-06 -4.74e-06
wchar -7.545e-06 9.59e-07 -7.870 0.000 -9.42e-06 -5.67e-06
pgfree -0.1040 0.016 -6.489 0.000 -0.135 -0.073
atch 0.3076 0.135 2.282 0.023 0.043 0.572
pgin -0.1148 0.009 -12.443 0.000 -0.133 -0.097
pflt -0.0271 0.002 -12.676 0.000 -0.031 -0.023
freemem -0.0003 5.03e-05 -5.551 0.000 -0.000 -0.000
freeswap 9.528e-06 1.85e-07 51.383 0.000 9.16e-06 9.89e-06
runqsz_Not_CPU_Bound 1.8713 0.124 15.070 0.000 1.628 2.115
scall_sq -2.861e-07 2.09e-08 -13.678 0.000 -3.27e-07 -2.45e-07
pflt_sq -6.019e-05 5.93e-06 -10.147 0.000 -7.18e-05 -4.86e-05
==============================================================================
Omnibus: 839.958 Durbin-Watson: 2.002
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1613.180
Skew: -0.916 Prob(JB): 0.00
Kurtosis: 4.843 Cond. No. 7.84e+07
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.84e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
In [57]:
# let us recreate the dataframe with actual, fitted and residual values
df_pred = pd.DataFrame()
df_pred["Actual Values"] = y_train.values.flatten() # actual values
df_pred["Fitted Values"] = olsres_6.fittedvalues.values # predicted values
df_pred["Residuals"] = olsres_6.resid.values # residuals
df_pred.head()
Out[57]:
| Actual Values | Fitted Values | Residuals | |
|---|---|---|---|
| 0 | 91.0 | 89.130465 | 1.869535 |
| 1 | 94.0 | 91.341269 | 2.658731 |
| 2 | 61.5 | 74.962262 | -13.462262 |
| 3 | 83.0 | 81.713945 | 1.286055 |
| 4 | 94.0 | 97.136625 | -3.136625 |
In [58]:
# let us plot the fitted values vs residuals
sns.set_style("whitegrid")
sns.residplot(
data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()
Test for Normality¶
In [59]:
sns.histplot(df_pred["Residuals"], kde=True)
plt.title("Normality of residuals")
plt.show()
In [60]:
w, p1_value = stats.shapiro(df_pred["Residuals"])
print('The p-value for Residuals is', p1_value)
alpha = 0.05 # Set your significance level
if p1_value > alpha:
print("The data appears to be normally distributed (fail to reject H0)")
else:
print("The data does not appear to be normally distributed (reject H0)")
The p-value for Residuals is 3.891742147061455e-39 The data does not appear to be normally distributed (reject H0)
In [61]:
import pylab
import scipy.stats as stats
stats.probplot(df_pred["Residuals"], dist="norm", plot=pylab)
plt.show()
In [62]:
X_train.skew()
Out[62]:
const 0.000000 lread 1.208855 scall 0.715733 rchar 1.119972 wchar 1.154196 pgfree 1.195839 atch 1.160107 pgin 1.239660 pflt 1.200329 freemem 1.196730 freeswap -0.779456 runqsz_Not_CPU_Bound -0.120234 scall_sq 1.922564 pflt_sq 2.019616 dtype: float64
Test for Homoscedasticity¶
The null and alternate hypotheses of the goldfeldquandt test are as follows:
- Null hypothesis : Residuals are homoscedastic
- Alternate hypothesis : Residuals have hetroscedasticity
In [63]:
import statsmodels.stats.api as sms
sms.het_goldfeldquandt(df_pred["Residuals"], X_train)[1]
Out[63]:
0.011332917693381926
Since p-value < 0.05 we can say that the residuals are hetroscedasticity.¶
In [ ]:
The model built olsmod_6 satisfies some assumptions of Linear Regression¶
In [64]:
olsres_6.summary()
Out[64]:
| Dep. Variable: | usr | R-squared: | 0.803 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.802 |
| Method: | Least Squares | F-statistic: | 1788. |
| Date: | Thu, 11 Jan 2024 | Prob (F-statistic): | 0.00 |
| Time: | 07:39:46 | Log-Likelihood: | -16565. |
| No. Observations: | 5734 | AIC: | 3.316e+04 |
| Df Residuals: | 5720 | BIC: | 3.325e+04 |
| Df Model: | 13 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 80.3258 | 0.359 | 223.722 | 0.000 | 79.622 | 81.030 |
| lread | -0.0406 | 0.004 | -9.385 | 0.000 | -0.049 | -0.032 |
| scall | 0.0009 | 0.000 | 6.575 | 0.000 | 0.001 | 0.001 |
| rchar | -5.569e-06 | 4.24e-07 | -13.124 | 0.000 | -6.4e-06 | -4.74e-06 |
| wchar | -7.545e-06 | 9.59e-07 | -7.870 | 0.000 | -9.42e-06 | -5.67e-06 |
| pgfree | -0.1040 | 0.016 | -6.489 | 0.000 | -0.135 | -0.073 |
| atch | 0.3076 | 0.135 | 2.282 | 0.023 | 0.043 | 0.572 |
| pgin | -0.1148 | 0.009 | -12.443 | 0.000 | -0.133 | -0.097 |
| pflt | -0.0271 | 0.002 | -12.676 | 0.000 | -0.031 | -0.023 |
| freemem | -0.0003 | 5.03e-05 | -5.551 | 0.000 | -0.000 | -0.000 |
| freeswap | 9.528e-06 | 1.85e-07 | 51.383 | 0.000 | 9.16e-06 | 9.89e-06 |
| runqsz_Not_CPU_Bound | 1.8713 | 0.124 | 15.070 | 0.000 | 1.628 | 2.115 |
| scall_sq | -2.861e-07 | 2.09e-08 | -13.678 | 0.000 | -3.27e-07 | -2.45e-07 |
| pflt_sq | -6.019e-05 | 5.93e-06 | -10.147 | 0.000 | -7.18e-05 | -4.86e-05 |
| Omnibus: | 839.958 | Durbin-Watson: | 2.002 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1613.180 |
| Skew: | -0.916 | Prob(JB): | 0.00 |
| Kurtosis: | 4.843 | Cond. No. | 7.84e+07 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.84e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
The model equation will be as follows:¶
In [65]:
# Let us write the equation of linear regression
Equation = "usr ="
print(Equation, end=" ")
for i in range(len(X_train.columns)):
if i == 0:
print(olsres_6.params[i], "+", end=" ")
elif i != len(X_train.columns) - 1:
print(
olsres_6.params[i],
"* (",
X_train.columns[i],
")",
"+",
end=" ",
)
else:
print(olsres_6.params[i], "* (", X_train.columns[i], ")")
usr = 80.32578269641903 + -0.04060862245726409 * ( lread ) + 0.0009109432268011643 * ( scall ) + -5.5691044371107365e-06 * ( rchar ) + -7.5446719419862535e-06 * ( wchar ) + -0.10398102553887754 * ( pgfree ) + 0.30760866082751137 * ( atch ) + -0.1148167884326387 * ( pgin ) + -0.0271225198213273 * ( pflt ) + -0.0002793470072376525 * ( freemem ) + 9.5281546836217e-06 * ( freeswap ) + 1.871281340030479 * ( runqsz_Not_CPU_Bound ) + -2.8613266551192783e-07 * ( scall_sq ) + -6.018855684078149e-05 * ( pflt_sq )
Predictions¶
In [66]:
X_train.columns
Out[66]:
Index(['const', 'lread', 'scall', 'rchar', 'wchar', 'pgfree', 'atch', 'pgin',
'pflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound', 'scall_sq',
'pflt_sq'],
dtype='object')
In [67]:
X_test.columns
Out[67]:
Index(['const', 'lread', 'lwrite', 'scall', 'sread', 'swrite', 'fork', 'exec',
'rchar', 'wchar', 'pgout', 'ppgout', 'pgfree', 'pgscan', 'atch', 'pgin',
'ppgin', 'pflt', 'vflt', 'freemem', 'freeswap', 'runqsz_Not_CPU_Bound'],
dtype='object')
In [70]:
# dropping columns from the test data that are not there in the training data
X_test2 = X_test.drop(
["exec", "lwrite", "vflt", "swrite", "sread", "pgout", "ppgin", "ppgout", "pgscan", "fork"], axis=1
)
In [73]:
# transforming the weight column in the test data corresponding to the training set
X_test2["scall_sq"] = np.square(X_test2["scall"])
X_test2["pflt_sq"] = np.square(X_test2["pflt"])
In [74]:
# let's make predictions on the test set
y_pred_test = olsres_6.predict(X_test2)
y_pred_train = olsres_6.predict(X_train)
In [75]:
# To check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [76]:
# let's check the RMSE on the train data
rmse1 = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse1
Out[76]:
4.349243018051866
In [77]:
# let's check the RMSE on the test data
rmse2 = np.sqrt(mean_squared_error(y_test, y_pred_test))
rmse2
Out[77]:
4.542132394160572
In [78]:
# let's check the MAE on the train data
mae1 = mean_absolute_error(y_train, df_pred["Fitted Values"])
mae1
Out[78]:
3.2083224632361858
In [79]:
# let's check the MAE on the test data
mae2 = mean_absolute_error(y_test, y_pred_test)
mae2
Out[79]:
3.260139643316443
Sci-Kit Learn Linear Regression¶
In [80]:
# invoke the LinearRegression function and find the bestfit model on training data
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[80]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [81]:
# Let us explore the coefficients for each of the independent attributes
for idx, col_name in enumerate(X_train.columns):
print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for const is 0.0 The coefficient for lread is -0.040608622458206595 The coefficient for scall is 0.0009109432268176952 The coefficient for rchar is -5.569104437094579e-06 The coefficient for wchar is -7.544671942038639e-06 The coefficient for pgfree is -0.10398102553937687 The coefficient for atch is 0.3076086608290831 The coefficient for pgin is -0.11481678843273727 The coefficient for pflt is -0.02712251982132653 The coefficient for freemem is -0.0002793470072352273 The coefficient for freeswap is 9.528154683609963e-06 The coefficient for runqsz_Not_CPU_Bound is 1.8712813400322006 The coefficient for scall_sq is -2.861326655126395e-07 The coefficient for pflt_sq is -6.018855684120802e-05
In [82]:
# Let us check the intercept for the model
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 80.32578269653469
In [83]:
# R square on training data
regression_model.score(X_train, y_train)
Out[83]:
0.8025428491193711
In [84]:
# R square on testing data
regression_model.score(X_test2, y_test)
Out[84]:
0.7786015387667776
In [85]:
#RMSE on Training data
from sklearn import metrics
predicted_train=regression_model.fit(X_train, y_train).predict(X_train)
np.sqrt(metrics.mean_squared_error(y_train,predicted_train))
Out[85]:
4.349243018051866
In [86]:
#RMSE on Testing data
predicted_test=regression_model.fit(X_train, y_train).predict(X_test2)
np.sqrt(metrics.mean_squared_error(y_test,predicted_test))
Out[86]:
4.542132394160525
In [ ]: